From d929456f3bf18b45a0db32e8bea2284073543644 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 13:25:40 +1100 Subject: [PATCH 1/8] Add risk-sensitive inventory management lecture and improve inventory_q Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 61 ++- lectures/rs_inventory_q.ipynb | 977 ++++++++++++++++++++++++++++++++++ lectures/rs_inventory_q.md | 761 ++++++++++++++++++++++++++ lectures/rs_inventory_q.py | 767 ++++++++++++++++++++++++++ 4 files changed, 2541 insertions(+), 25 deletions(-) create mode 100644 lectures/rs_inventory_q.ipynb create mode 100644 lectures/rs_inventory_q.md create mode 100644 lectures/rs_inventory_q.py diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 5640929a6..39740eb6b 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -37,7 +37,7 @@ We approach the problem in two ways. First, we solve it exactly using dynamic programming, assuming full knowledge of the model — the demand distribution, cost parameters, and transition dynamics. -Second, we show how a manager can learn the optimal policy from experience alone, using *Q-learning*. +Second, we show how a manager can learn the optimal policy from experience alone, using *[Q-learning](https://en.wikipedia.org/wiki/Q-learning)*. The manager observes only the inventory level, the order placed, the resulting profit, and the next inventory level — without knowing any of the underlying @@ -50,6 +50,12 @@ transition function. We show that, given enough experience, the manager's learned policy converges to the optimal one. +The lecture proceeds as follows: + +1. We set up the inventory model and solve it exactly via value function iteration. +2. We introduce Q-factors and derive the Q-factor Bellman equation. +3. We implement Q-learning and show the learned policy converges to the optimal one. 
+ We will use the following imports: ```{code-cell} ipython3 @@ -85,12 +91,14 @@ $$ \qquad \text{where} \quad - h(x,a,d) := (x - d) \vee 0 + a. + h(x,a,d) := \max(x - d, 0) + a. $$ The term $A_t$ is units of stock ordered this period, which arrive at the start of period $t+1$, after demand $D_{t+1}$ is realized and served. +**Timeline for period $t$:** observe $X_t$ → choose $A_t$ → demand $D_{t+1}$ arrives → profit realized → $X_{t+1}$ determined. + (We use a $t$ subscript in $A_t$ to indicate the information set: it is chosen before $D_{t+1}$ is observed.) @@ -99,7 +107,7 @@ We assume that the firm can store at most $K$ items at one time. Profits are given by $$ - \pi(X_t, A_t, D_{t+1}) := X_t \wedge D_{t+1} - c A_t - \kappa 1\{A_t > 0\}. + \pi(X_t, A_t, D_{t+1}) := \min(X_t, D_{t+1}) - c A_t - \kappa 1\{A_t > 0\}. $$ Here @@ -158,7 +166,7 @@ $$ \sum_d \phi(d) \left[ \pi(x, a, d) + \beta \, v(h(x, a, d)) \right] $$ -When $r > 0$, the sequence $v_{k+1} = T v_k$ converges to the +When $r > 0$ (equivalently, $\beta < 1$), the sequence $v_{k+1} = T v_k$ converges to the unique fixed point $v^*$, which is the value function of the optimal policy (see, e.g., {cite}`Sargent_Stachurski_2025`). @@ -167,7 +175,7 @@ unique fixed point $v^*$, which is the value function of the optimal policy We store the model primitives in a `NamedTuple`. -Demand follows a geometric distribution with parameter $p$, so $\phi(d) = (1 - p)^d \, p$ for $d = 0, 1, 2, \ldots$. +Demand follows a [geometric distribution](https://en.wikipedia.org/wiki/Geometric_distribution) with parameter $p$, so $\phi(d) = (1 - p)^d \, p$ for $d = 0, 1, 2, \ldots$. ```{code-cell} ipython3 class Model(NamedTuple): @@ -427,7 +435,7 @@ transition function. ### The Q-learning update rule -Q-learning approximates the fixed point of the Q-factor Bellman equation using **stochastic approximation**. 
+Q-learning approximates the fixed point of the Q-factor Bellman equation using **[stochastic approximation](https://en.wikipedia.org/wiki/Stochastic_approximation)**. At each step, the agent is in state $x$, takes action $a$, observes reward $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, and @@ -443,8 +451,23 @@ where $\alpha_t$ is the learning rate. The update blends the current estimate $q_t(x, a)$ with a fresh sample of the Bellman target. +### What the manager needs to know + +Notice what is **not** required to implement the update. + +The manager does not need to know the demand distribution $\phi$, the unit cost $c$, the fixed cost $\kappa$, or the transition function $h$. + +All the manager needs to observe at each step is: + +1. the current inventory level $x$, +2. the order quantity $a$ they chose, +3. the resulting profit $R_{t+1}$ (which appears on the books), and +4. the next inventory level $X_{t+1}$ (which they can read off the warehouse). + +These are all directly observable quantities — no model knowledge is required. + -### The Q-table and the behavior policy +### The Q-table and the role of the max It is important to understand how the update rule relates to the manager's actions. @@ -489,7 +512,9 @@ By contrast, the original update with the $\max$ is a stochastic sample of the Bellman *optimality* operator, whose fixed point is $q^*$. The $\max$ in the update target is therefore what drives convergence to $q^*$. -**The behavior policy.** +In short, the $\max$ is doing the work of finding the optimum; without it, you only evaluate a fixed policy. + +### The behavior policy The rule governing how the manager chooses actions is called the **behavior policy**. 
Because the $\max$ in the update target always points toward $q^*$ @@ -511,26 +536,11 @@ In practice, we want the manager to mostly take good actions (to earn reasonable profits while learning), while still occasionally experimenting to discover better alternatives. -### What the manager needs to know - -Notice what is **not** required to implement the update. - -The manager does not need to know the demand distribution $\phi$, the unit cost $c$, the fixed cost $\kappa$, or the transition function $h$. - -All the manager needs to observe at each step is: - -1. the current inventory level $x$, -2. the order quantity $a$ they chose, -3. the resulting profit $R_{t+1}$ (which appears on the books), and -4. the next inventory level $X_{t+1}$ (which they can read off the warehouse). - -These are all directly observable quantities — no model knowledge is required. - ### Learning rate We use $\alpha_t = 1 / n_t(x, a)^{0.51}$, where $n_t(x, a)$ is the number of times the pair $(x, a)$ has been visited up to time $t$. -This decays slowly enough to allow learning from later (better-informed) updates, while still satisfying the Robbins-Monro conditions for convergence. +This decays slowly enough to allow learning from later (better-informed) updates, while still satisfying the [Robbins–Monro conditions](https://en.wikipedia.org/wiki/Stochastic_approximation#Robbins%E2%80%93Monro_algorithm) for convergence. ### Exploration: epsilon-greedy @@ -593,7 +603,7 @@ def q_learning_kernel(K, p, c, κ, β, n_steps, X_init, snapshots[snap_idx] = greedy_policy_from_q(q, K) snap_idx += 1 - # === Observe outcome === + # === Draw D_{t+1} and observe outcome === d = np.random.geometric(p) - 1 reward = min(x, d) - c * a - κ * (a > 0) x_next = max(x - d, 0) + a @@ -661,6 +671,7 @@ and compare them against $v^*$ and $\sigma^*$ from VFI. 
```{code-cell} ipython3 K = len(x_values) - 1 +# restrict to feasible actions a ∈ {0, ..., K-x} v_q = np.array([np.max(q[x, :K - x + 1]) for x in range(K + 1)]) σ_q = np.array([np.argmax(q[x, :K - x + 1]) for x in range(K + 1)]) ``` diff --git a/lectures/rs_inventory_q.ipynb b/lectures/rs_inventory_q.ipynb new file mode 100644 index 000000000..99dc4f676 --- /dev/null +++ b/lectures/rs_inventory_q.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "17be60e4", + "metadata": {}, + "source": [ + "(rs_inventory_q)=\n", + "```{raw} jupyter\n", + "
\n", + " \n", + " \"QuantEcon\"\n", + " \n", + "
\n", + "```\n", + "\n", + "# Risk-Sensitive Inventory Management via Q-Learning\n", + "\n", + "```{contents} Contents\n", + ":depth: 2\n", + "```\n", + "\n", + "## Introduction\n", + "\n", + "In {ref}`inventory_q`, we looked at an inventory management\n", + "problem and solved it with both value function iteration and Q-learning.\n", + "\n", + "In this lecture, we consider a risk-sensitive variation.\n", + "\n", + "Injection of risk-sensitivity acknowledges the fact\n", + "that, in incomplete markets with financial and informational frictions, firms\n", + "typically take risk into account in their decision making.\n", + "\n", + "In other words, the actions of firms are not, in general, risk neutral.\n", + "\n", + "One natural way to handle this is to use a risk-sensitive version of the Bellman\n", + "equation.\n", + "\n", + "We show how the model can be solved using value function iteration.\n", + "\n", + "We then investigate how risk sensitivity affects the optimal policy." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95f2dc54", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import numba\n", + "import matplotlib.pyplot as plt\n", + "from typing import NamedTuple" + ] + }, + { + "cell_type": "markdown", + "id": "acc004dc", + "metadata": {}, + "source": [ + "## The Model\n", + "\n", + "The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form\n", + "\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)} \\mathbb E\n", + " \\left[\n", + " \\pi(x, a, D)\n", + " + \\beta v(h(x, a, D))\n", + " \\right].\n", + "$$\n", + "\n", + "Here $D$ is a random variable with distribution $\\phi$.\n", + "\n", + "(Primitives and definitions are the same as in {ref}`inventory_q`.)\n", + "\n", + "The risk-sensitive version of this Bellman equation has the form\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)}\n", + " \\phi^{-1}\n", + " \\left\\{\n", + " \\mathbb E 
\\phi\n", + " \\left[\n", + " \\pi(x, a, D)\n", + " + \\beta v(h(x, a, D))\n", + " \\right]\n", + " \\right\\},\n", + "$$\n", + "\n", + "where $\\phi(t) = \\exp(-\\gamma t)$ for fixed $\\gamma > 0$.\n", + "\n", + "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$, the Bellman equation becomes\n", + "\n", + "$$\n", + " v(x)\n", + " = \\max_{a \\in \\Gamma(x)}\n", + " \\left(\n", + " -\\frac{1}{\\gamma}\n", + " \\right)\n", + " \\ln\n", + " \\left\\{\n", + " \\sum_d \\phi(d) \\exp\n", + " \\left[\n", + " -\\gamma \\left( \\pi(x, a, d) + \\beta \\, v(h(x, a, d)) \\right)\n", + " \\right]\n", + " \\right\\}.\n", + "$$\n", + "\n", + "Here $\\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`.\n", + "\n", + "The parameter $\\gamma$ controls the degree of risk sensitivity.\n", + "\n", + "As $\\gamma \\to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case.\n", + "\n", + "Larger $\\gamma$ means more aversion to downside risk.\n", + "\n", + "The Bellman operator, greedy policy, and VFI algorithm all carry over from the\n", + "risk-neutral case, with the expectation replaced by the certainty equivalent.\n", + "\n", + "\n", + "\n", + "## Solving via Value Function Iteration\n", + "\n", + "### Model specification\n", + "\n", + "We reuse the same model primitives as in {ref}`inventory_q`, adding $\\gamma$ as a parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7909494", + "metadata": {}, + "outputs": [], + "source": [ + "class RSModel(NamedTuple):\n", + " x_values: np.ndarray # Inventory values\n", + " d_values: np.ndarray # Demand values for summation\n", + " ϕ_values: np.ndarray # Demand probabilities\n", + " p: float # Demand parameter\n", + " c: float # Unit cost\n", + " κ: float # Fixed cost\n", + " β: float # Discount factor\n", + " γ: float # Risk-sensitivity parameter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8cf486f", + "metadata": {}, + "outputs": [], + "source": [ + "def create_rs_inventory_model(\n", + " K: int = 20, # Max inventory\n", + " D_MAX: int = 21, # Demand upper bound for summation\n", + " p: float = 0.7,\n", + " c: float = 0.2,\n", + " κ: float = 0.8,\n", + " β: float = 0.98,\n", + " γ: float = 1.0\n", + " ) -> RSModel:\n", + "\n", + " def demand_pdf(p, d):\n", + " return (1 - p)**d * p\n", + "\n", + " d_values = np.arange(D_MAX)\n", + " ϕ_values = demand_pdf(p, d_values)\n", + " x_values = np.arange(K + 1)\n", + "\n", + " return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ)" + ] + }, + { + "cell_type": "markdown", + "id": "060e0b9c", + "metadata": {}, + "source": [ + "### The Bellman operator\n", + "\n", + "The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent.\n", + "\n", + "For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \\pi(x, a, d_i) + \\beta \\, v(h(x, a, d_i))$, we compute\n", + "\n", + "$$\n", + " -\\frac{1}{\\gamma} \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i)\n", + " \\;=\\;\n", + " -\\frac{1}{\\gamma}\n", + " \\left(\n", + " m + \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i - m)\n", + " \\right),\n", + "$$\n", + "\n", + "where $m = \\max_i (-\\gamma z_i)$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ecff396", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", + " new_v = np.empty(K + 1)\n", + " n_d = len(d_values)\n", + " for x in range(K + 1):\n", + " best = -np.inf\n", + " for a in range(K - x + 1):\n", + " # Compute -γ * z_i for each demand realization\n", + " exponents = np.empty(n_d)\n", + " for i in range(n_d):\n", + " d = d_values[i]\n", + " x_next = max(x - d, 0) + a\n", + " revenue = min(x, d)\n", + " cost = c * a + κ * (a > 0)\n", + " z_i = revenue - cost + β * v[x_next]\n", + " exponents[i] = -γ * z_i\n", + " # Log-sum-exp trick for numerical stability\n", + " m = np.max(exponents)\n", + " weighted_sum = 0.0\n", + " for i in range(n_d):\n", + " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", + " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", + " if val > best:\n", + " best = val\n", + " new_v[x] = best\n", + " return new_v" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fdd1618", + "metadata": {}, + "outputs": [], + "source": [ + "def T_rs(v, model):\n", + " \"\"\"The risk-sensitive Bellman operator.\"\"\"\n", + " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", + " K = len(x_values) - 1\n", + " return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" + ] + }, + { + "cell_type": "markdown", + "id": "03fbea31", + "metadata": {}, + "source": [ + "### Computing the greedy policy\n", + "\n", + "The greedy policy records the maximizing action instead of the maximized value." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eeb7faf", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", + " σ = np.empty(K + 1, dtype=np.int32)\n", + " n_d = len(d_values)\n", + " for x in range(K + 1):\n", + " best = -np.inf\n", + " best_a = 0\n", + " for a in range(K - x + 1):\n", + " exponents = np.empty(n_d)\n", + " for i in range(n_d):\n", + " d = d_values[i]\n", + " x_next = max(x - d, 0) + a\n", + " revenue = min(x, d)\n", + " cost = c * a + κ * (a > 0)\n", + " z_i = revenue - cost + β * v[x_next]\n", + " exponents[i] = -γ * z_i\n", + " m = np.max(exponents)\n", + " weighted_sum = 0.0\n", + " for i in range(n_d):\n", + " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", + " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", + " if val > best:\n", + " best = val\n", + " best_a = a\n", + " σ[x] = best_a\n", + " return σ" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5473245", + "metadata": {}, + "outputs": [], + "source": [ + "def get_greedy_rs(v, model):\n", + " \"\"\"Get a v-greedy policy for the risk-sensitive model.\"\"\"\n", + " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", + " K = len(x_values) - 1\n", + " return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" + ] + }, + { + "cell_type": "markdown", + "id": "a6a3b8be", + "metadata": {}, + "source": [ + "### Value function iteration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ea93ea8", + "metadata": {}, + "outputs": [], + "source": [ + "def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6):\n", + " v = v_init.copy()\n", + " i, error = 0, tol + 1\n", + "\n", + " while i < max_iter and error > tol:\n", + " new_v = T_rs(v, model)\n", + " error = np.max(np.abs(new_v - v))\n", + " i += 1\n", + " v = new_v\n", + "\n", + " print(f\"Converged in {i} iterations with error 
{error:.2e}\")\n", + "\n", + " σ = get_greedy_rs(v, model)\n", + " return v, σ" + ] + }, + { + "cell_type": "markdown", + "id": "962b1a57", + "metadata": {}, + "source": [ + "### Creating and solving an instance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6fe6712", + "metadata": {}, + "outputs": [], + "source": [ + "model = create_rs_inventory_model()\n", + "x_values = model.x_values\n", + "n_x = len(x_values)\n", + "v_init = np.zeros(n_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dc85cec", + "metadata": {}, + "outputs": [], + "source": [ + "v_star, σ_star = solve_rs_inventory_model(v_init, model)" + ] + }, + { + "cell_type": "markdown", + "id": "24295d26", + "metadata": {}, + "source": [ + "### Effect of risk sensitivity on the optimal policy\n", + "\n", + "We solve the model for several values of $\\gamma$ and compare the resulting policies.\n", + "\n", + "As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c599f91f", + "metadata": {}, + "outputs": [], + "source": [ + "γ_values = [0.01, 1.0, 2.0]\n", + "results = {}\n", + "\n", + "for γ in γ_values:\n", + " mod = create_rs_inventory_model(γ=γ)\n", + " v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)\n", + " results[γ] = (v, σ)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfb69b17", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", + "\n", + "for γ in γ_values:\n", + " v, σ = results[γ]\n", + " axes[0].plot(x_values, v, label=f\"$\\\\gamma = {γ}$\")\n", + " axes[1].plot(x_values, σ, label=f\"$\\\\gamma = {γ}$\")\n", + "\n", + "axes[0].set_xlabel(\"inventory\")\n", + "axes[0].set_ylabel(\"value\")\n", + "axes[0].legend()\n", + "axes[0].set_title(\"Value function\")\n", + "\n", + "axes[1].set_xlabel(\"inventory\")\n", + "axes[1].set_ylabel(\"order quantity\")\n", + "axes[1].legend()\n", + "axes[1].set_title(\"Policy\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "55ed320c", + "metadata": {}, + "source": [ + "### Simulating the optimal policy\n", + "\n", + "We simulate inventory dynamics under the optimal policy for the baseline $\\gamma$." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bfdd38c", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def sim_inventories(ts_length, σ, p, X_init=0):\n", + " \"\"\"Simulate inventory dynamics under policy σ.\"\"\"\n", + " X = np.zeros(ts_length, dtype=np.int32)\n", + " X[0] = X_init\n", + " for t in range(ts_length - 1):\n", + " d = np.random.geometric(p) - 1\n", + " X[t+1] = max(X[t] - d, 0) + σ[X[t]]\n", + " return X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "993bf7d8", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(len(γ_values), 1,\n", + " figsize=(8, 2.0 * len(γ_values)),\n", + " sharex=True)\n", + "\n", + "ts_length = 200\n", + "sim_seed = 5678\n", + "K = len(x_values) - 1\n", + "\n", + "for i, γ in enumerate(γ_values):\n", + " v, σ = results[γ]\n", + " np.random.seed(sim_seed)\n", + " X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)\n", + " axes[i].plot(X, alpha=0.7)\n", + " axes[i].set_ylabel(\"inventory\")\n", + " axes[i].set_title(f\"$\\\\gamma = {γ}$\")\n", + " axes[i].set_ylim(0, K + 2)\n", + "\n", + "axes[-1].set_xlabel(r\"$t$\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "f33d501a", + "metadata": {}, + "source": [ + "## Interpreting the Outcomes\n", + "\n", + "The plots above show that a more risk-sensitive firm (larger $\\gamma$) orders\n", + "less inventory and maintains lower stock levels.\n", + "\n", + "At first glance this may seem surprising: wouldn't holding more inventory\n", + "reduce variance by ensuring the firm can always meet demand?\n", + "\n", + "The key is to identify where the randomness in profits actually comes from.\n", + "\n", + "Recall that per-period profit is $\\pi(x, a, d) = \\min(x, d) - ca - \\kappa\n", + "\\mathbf{1}\\{a > 0\\}$.\n", + "\n", + "The ordering cost $ca + \\kappa \\mathbf{1}\\{a > 0\\}$ is **deterministic** — it\n", + "is chosen 
before the demand shock is realized.\n", + "\n", + "So higher ordering shifts the level of profits down but does not affect their\n", + "variance.\n", + "\n", + "The variance comes from **revenue**: $\\min(x, D)$.\n", + "\n", + "When inventory $x$ is high, $\\min(x, D) \\approx D$ for most demand\n", + "realizations — revenue inherits the full variance of demand.\n", + "\n", + "When inventory $x$ is low, $\\min(x, D) \\approx x$ for most realizations —\n", + "revenue is nearly deterministic, capped at the inventory level.\n", + "\n", + "A risk-sensitive agent therefore prefers lower inventory because it **caps the\n", + "randomness of revenue**.\n", + "\n", + "The agent accepts lower expected sales in exchange for more predictable profits.\n", + "\n", + "There is also a continuation value channel: next-period inventory $\\max(x - D,\n", + "0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more\n", + "closely, propagating that variance forward through $v$.\n", + "\n", + "\n", + "## Q-Learning\n", + "\n", + "We now ask whether the optimal policy can be learned without knowledge of the\n", + "model, as we did in the risk-neutral case in {ref}`inventory_q`.\n", + "\n", + "### The Q-factor\n", + "\n", + "The first step is to define the Q-factor in a way that is compatible with the\n", + "risk-sensitive Bellman equation.\n", + "\n", + "We define\n", + "\n", + "$$\n", + " q(x, a) := \\mathbb E\n", + " \\left[\n", + " \\exp\\!\\left(\n", + " -\\gamma \\left( \\pi(x, a, D) + \\beta \\, v^*(h(x, a, D)) \\right)\n", + " \\right)\n", + " \\right].\n", + "$$\n", + "\n", + "In words, $q(x, a)$ applies the risk-sensitivity transformation $\\phi(t) =\n", + "\\exp(-\\gamma t)$ inside the expectation, evaluated at the return from taking\n", + "action $a$ in state $x$ and following the optimal policy thereafter.\n", + "\n", + "### Deriving the Q-factor Bellman equation\n", + "\n", + "Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$.\n", + 
"\n", + "**Step 1.** Express $v^*$ in terms of $q$.\n", + "\n", + "The risk-sensitive Bellman equation says $v^*(x) = \\max_{a \\in \\Gamma(x)}\n", + "\\phi^{-1}(q(x, a))$.\n", + "\n", + "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$ is a **decreasing** function,\n", + "the maximum over $a$ of $\\phi^{-1}(q(x, a))$ corresponds to the **minimum**\n", + "over $a$ of $q(x, a)$:\n", + "\n", + "$$\n", + " v^*(x)\n", + " = \\phi^{-1}\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right)\n", + " = -\\frac{1}{\\gamma} \\ln\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right).\n", + "$$\n", + "\n", + "Equivalently,\n", + "\n", + "$$\n", + " \\exp(-\\gamma \\, v^*(x)) = \\min_{a \\in \\Gamma(x)} q(x, a).\n", + "$$\n", + "\n", + "**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$.\n", + "\n", + "Expanding the exponential in the definition of $q$,\n", + "\n", + "$$\n", + " q(x, a)\n", + " = \\mathbb E\n", + " \\left[\n", + " \\exp(-\\gamma \\, \\pi(x, a, D))\n", + " \\;\\cdot\\;\n", + " \\exp\\!\\left(-\\gamma \\beta \\, v^*(x')\\right)\n", + " \\right]\n", + "$$\n", + "\n", + "where $x' = h(x, a, D)$.\n", + "\n", + "From Step 1, $\\exp(-\\gamma \\, v^*(x')) = \\min_{a' \\in \\Gamma(x')} q(x', a')$,\n", + "so $\\exp(-\\gamma \\beta \\, v^*(x')) = \\left[\\min_{a' \\in \\Gamma(x')} q(x',\n", + "a')\\right]^\\beta$.\n", + "\n", + "Substituting,\n", + "\n", + "$$\n", + " q(x, a)\n", + " = \\mathbb E\n", + " \\left[\n", + " \\exp(-\\gamma \\, \\pi(x, a, D))\n", + " \\;\\cdot\\;\n", + " \\left(\\min_{a' \\in \\Gamma(x')} q(x', a')\\right)^\\beta\n", + " \\right].\n", + "$$\n", + "\n", + "This is a fixed point equation in $q$ alone — $v^*$ has been eliminated.\n", + "\n", + "### The Q-learning update rule\n", + "\n", + "As in the risk-neutral case, we approximate the fixed point using stochastic\n", + "approximation.\n", + "\n", + "At each step, the agent is in state $x$, takes action $a$, observes profit\n", + "$R_{t+1} = \\pi(x, a, D_{t+1})$ and next state 
$X_{t+1} = h(x, a, D_{t+1})$,\n", + "and updates\n", + "\n", + "$$\n", + " q_{t+1}(x, a)\n", + " = (1 - \\alpha_t) \\, q_t(x, a)\n", + " + \\alpha_t\n", + " \\left[\n", + " \\exp(-\\gamma \\, R_{t+1})\n", + " \\;\\cdot\\;\n", + " \\left(\\min_{a' \\in \\Gamma(X_{t+1})} q_t(X_{t+1}, a')\\right)^\\beta\n", + " \\right].\n", + "$$\n", + "\n", + "The term in brackets is a single-sample estimate of the right-hand side of the\n", + "Q-factor Bellman equation.\n", + "\n", + "The update blends the current estimate with this fresh sample, just as in\n", + "standard Q-learning.\n", + "\n", + "Notice several differences from the risk-neutral case:\n", + "\n", + "- The Q-values are **positive** (expectations of exponentials) rather than signed.\n", + "- The optimal policy is $\\sigma(x) = \\arg\\min_a q(x, a)$ — we **minimize**\n", + " rather than maximize, because $\\phi^{-1}$ is decreasing.\n", + "- The observed profit enters through $\\exp(-\\gamma R_{t+1})$ rather than\n", + " additively.\n", + "- The continuation value enters as a **power** $(\\min_{a'} q_t)^\\beta$ rather\n", + " than a scaled sum $\\beta \\cdot \\max_{a'} q_t$.\n", + "\n", + "As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and\n", + "$X_{t+1}$ — no model knowledge is required.\n", + "\n", + "### Implementation plan\n", + "\n", + "Our implementation follows the same structure as the risk-neutral Q-learning in\n", + "{ref}`inventory_q`, with the modifications above:\n", + "\n", + "1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and\n", + " visit counts $n$ to zeros.\n", + "2. 
@numba.jit(nopython=True)
def greedy_policy_from_q_rs(q, K):
    """Read off the greedy policy from a risk-sensitive Q-table.

    Risk-sensitive Q-values sit inside the decreasing transform
    exp(-γ ·), so the greedy action at each state minimizes q over the
    feasible orders a ∈ {0, ..., K - x}.
    """
    policy = np.empty(K + 1, dtype=np.int32)
    for state in range(K + 1):
        n_feasible = K - state + 1
        argmin_a = 0
        # Strict comparison keeps the lowest-index action on ties
        for action in range(1, n_feasible):
            if q[state, action] < q[state, argmin_a]:
                argmin_a = action
        policy[state] = argmin_a
    return policy
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73fcd434", + "metadata": {}, + "outputs": [], + "source": [ + "@numba.jit(nopython=True)\n", + "def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", + " ε_init, ε_min, ε_decay, snapshot_steps):\n", + " q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1\n", + " n = np.zeros((K + 1, K + 1)) # visit counts for learning rate\n", + " ε = ε_init\n", + "\n", + " n_snaps = len(snapshot_steps)\n", + " snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32)\n", + " snap_idx = 0\n", + "\n", + " # Initialize state and action\n", + " x = X_init\n", + " a = np.random.randint(0, K - x + 1)\n", + "\n", + " for t in range(n_steps):\n", + " # Record policy snapshot if needed\n", + " if snap_idx < n_snaps and t == snapshot_steps[snap_idx]:\n", + " snapshots[snap_idx] = greedy_policy_from_q_rs(q, K)\n", + " snap_idx += 1\n", + "\n", + " # === Draw D_{t+1} and observe outcome ===\n", + " d = np.random.geometric(p) - 1\n", + " reward = min(x, d) - c * a - κ * (a > 0)\n", + " x_next = max(x - d, 0) + a\n", + "\n", + " # === Min over next state (scalar value for update target) ===\n", + " # Also record the argmin action for use by the behavior policy.\n", + " best_next = np.inf\n", + " a_next = 0\n", + " for aa in range(K - x_next + 1):\n", + " if q[x_next, aa] < best_next:\n", + " best_next = q[x_next, aa]\n", + " a_next = aa\n", + "\n", + " # === Risk-sensitive Q-learning update ===\n", + " target = np.exp(-γ * reward) * best_next ** β\n", + " n[x, a] += 1\n", + " α = 1.0 / n[x, a] ** 0.51\n", + " q[x, a] = (1 - α) * q[x, a] + α * target\n", + "\n", + " # === Behavior policy: ε-greedy (uses a_next, the argmin action) ===\n", + " x = x_next\n", + " if np.random.random() < ε:\n", + " a = np.random.randint(0, K - x + 1)\n", + " else:\n", + " a = a_next\n", + " ε = max(ε_min, ε * ε_decay)\n", + "\n", + " return q, snapshots" + ] + }, + { + "cell_type": "markdown", + "id": "72492777", + 
def q_learning_rs(model, n_steps=20_000_000, X_init=0,
                  ε_init=1.0, ε_min=0.01, ε_decay=0.999999,
                  snapshot_steps=None):
    """Unpack the model and run the risk-sensitive Q-learning kernel.

    snapshot_steps, if given, is an int64 array of step indices at which
    the greedy policy is recorded; by default no snapshots are taken.
    """
    x_values, d_values, ϕ_values, p, c, κ, β, γ = model
    K = len(x_values) - 1
    if snapshot_steps is None:
        snapshot_steps = np.array([], dtype=np.int64)
    return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,
                                ε_init, ε_min, ε_decay, snapshot_steps)


# Run Q-learning, recording the policy early, midway, and at the end
np.random.seed(1234)
snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64)
q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps)
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dccdb8f4", + "metadata": {}, + "outputs": [], + "source": [ + "K = len(x_values) - 1\n", + "γ_base = model.γ\n", + "# restrict to feasible actions a ∈ {0, ..., K-x}\n", + "v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1]))\n", + " for x in range(K + 1)])\n", + "σ_q = np.array([np.argmin(q_table[x, :K - x + 1])\n", + " for x in range(K + 1)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a855f91", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", + "\n", + "axes[0].plot(x_values, v_star, label=\"VFI\")\n", + "axes[0].plot(x_values, v_q, '--', label=\"Q-learning\")\n", + "axes[0].set_xlabel(\"inventory\")\n", + "axes[0].set_ylabel(\"value\")\n", + "axes[0].legend()\n", + "axes[0].set_title(\"Value function\")\n", + "\n", + "axes[1].plot(x_values, σ_star, label=\"VFI\")\n", + "axes[1].plot(x_values, σ_q, '--', label=\"Q-learning\")\n", + "axes[1].set_xlabel(\"inventory\")\n", + "axes[1].set_ylabel(\"order quantity\")\n", + "axes[1].legend()\n", + "axes[1].set_title(\"Policy\")\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8579f07d", + "metadata": {}, + "source": [ + "### Visualizing learning over time\n", + "\n", + "The panels below show how the agent's policy evolves during training.\n", + "\n", + "Each panel simulates an inventory path using the greedy policy extracted from\n", + "the Q-table at a given training step, with the same demand sequence throughout.\n", + "\n", + "The top panel shows the optimal policy from VFI for reference." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "142d3906", + "metadata": {}, + "outputs": [], + "source": [ + "ts_length = 200\n", + "n_snaps = len(snap_steps)\n", + "fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)),\n", + " sharex=True)\n", + "\n", + "X_init = K // 2\n", + "sim_seed = 5678\n", + "\n", + "# Optimal policy\n", + "np.random.seed(sim_seed)\n", + "X_opt = sim_inventories(ts_length, σ_star, model.p, X_init)\n", + "axes[0].plot(X_opt, alpha=0.7)\n", + "axes[0].set_ylabel(\"inventory\")\n", + "axes[0].set_title(\"Optimal (VFI)\")\n", + "axes[0].set_ylim(0, K + 2)\n", + "\n", + "# Q-learning snapshots\n", + "for i in range(n_snaps):\n", + " σ_snap = snapshots[i]\n", + " np.random.seed(sim_seed)\n", + " X = sim_inventories(ts_length, σ_snap, model.p, X_init)\n", + " axes[i + 1].plot(X, alpha=0.7)\n", + " axes[i + 1].set_ylabel(\"inventory\")\n", + " axes[i + 1].set_title(f\"Step {snap_steps[i]:,}\")\n", + " axes[i + 1].set_ylim(0, K + 2)\n", + "\n", + "axes[-1].set_xlabel(r\"$t$\")\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "274e77d9", + "metadata": {}, + "source": [ + "After 10,000 steps the agent has barely explored and its policy is erratic.\n", + "\n", + "By 1,000,000 steps the learned policy begins to resemble the optimal one, and\n", + "by step 20 million the inventory dynamics are nearly indistinguishable from the\n", + "VFI solution.\n", + "\n", + "Note that the converged policy maintains lower inventory levels than in the\n", + "risk-neutral case (compare with {ref}`inventory_q`), consistent with the\n", + "mechanism discussed above: a risk-sensitive agent caps its exposure to demand\n", + "variance by holding less stock.\n", + "\n", + "## Conclusion\n", + "\n", + "We extended the inventory management problem from {ref}`inventory_q` to\n", + "incorporate risk sensitivity via the certainty equivalent operator\n", + 
"$\\phi^{-1}(\\mathbb{E}[\\phi(\\cdot)])$ with $\\phi(t) = \\exp(-\\gamma t)$.\n", + "\n", + "Value function iteration confirms that risk-sensitive firms order less\n", + "aggressively, preferring predictable profits over higher but more volatile\n", + "returns.\n", + "\n", + "We then showed that Q-learning can be adapted to the risk-sensitive setting by\n", + "working with the transformed Q-factor $q(x,a) =\n", + "\\mathbb{E}[\\exp(-\\gamma(\\pi + \\beta v^*))]$.\n", + "\n", + "The resulting update rule replaces addition with multiplication and max with\n", + "min, but retains the key property of model-free learning: the agent needs only\n", + "to observe states, actions, and profits." + ] + } + ], + "metadata": { + "jupytext": { + "default_lexer": "ipython3" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md new file mode 100644 index 000000000..6a368656b --- /dev/null +++ b/lectures/rs_inventory_q.md @@ -0,0 +1,761 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.18.1 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +(rs_inventory_q)= +```{raw} jupyter + +``` + +# Risk-Sensitive Inventory Management via Q-Learning + +```{contents} Contents +:depth: 2 +``` + +## Introduction + +In {ref}`inventory_q`, we looked at an inventory management +problem and solved it with both value function iteration and Q-learning. + +In this lecture, we consider a risk-sensitive variation. + +Injection of risk-sensitivity acknowledges the fact +that, in incomplete markets with financial and informational frictions, firms +typically take risk into account in their decision making. + +In other words, the actions of firms are not, in general, risk neutral. 
+ +One natural way to handle this is to use a risk-sensitive version of the Bellman +equation. + +We show how the model can be solved using value function iteration. + +We then investigate how risk sensitivity affects the optimal policy. + + + +```{code-cell} ipython3 +import numpy as np +import numba +import matplotlib.pyplot as plt +from typing import NamedTuple +``` + + +## The Model + +The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form + + +$$ + v(x) + = \max_{a \in \Gamma(x)} \mathbb E + \left[ + \pi(x, a, D) + + \beta v(h(x, a, D)) + \right]. +$$ + +Here $D$ is a random variable with distribution $\phi$. + +(Primitives and definitions are the same as in {ref}`inventory_q`.) + +The risk-sensitive version of this Bellman equation has the form + +$$ + v(x) + = \max_{a \in \Gamma(x)} + \phi^{-1} + \left\{ + \mathbb E \phi + \left[ + \pi(x, a, D) + + \beta v(h(x, a, D)) + \right] + \right\}, +$$ + +where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. + +Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes + +$$ + v(x) + = \max_{a \in \Gamma(x)} + \left( + -\frac{1}{\gamma} + \right) + \ln + \left\{ + \sum_d \phi(d) \exp + \left[ + -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) + \right] + \right\}. +$$ + +Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. + +The parameter $\gamma$ controls the degree of risk sensitivity. + +As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. + +Larger $\gamma$ means more aversion to downside risk. + +The Bellman operator, greedy policy, and VFI algorithm all carry over from the +risk-neutral case, with the expectation replaced by the certainty equivalent. + + + +## Solving via Value Function Iteration + +### Model specification + +We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. 
+ +```{code-cell} ipython3 +class RSModel(NamedTuple): + x_values: np.ndarray # Inventory values + d_values: np.ndarray # Demand values for summation + ϕ_values: np.ndarray # Demand probabilities + p: float # Demand parameter + c: float # Unit cost + κ: float # Fixed cost + β: float # Discount factor + γ: float # Risk-sensitivity parameter +``` + +```{code-cell} ipython3 +def create_rs_inventory_model( + K: int = 20, # Max inventory + D_MAX: int = 21, # Demand upper bound for summation + p: float = 0.7, + c: float = 0.2, + κ: float = 0.8, + β: float = 0.98, + γ: float = 1.0 + ) -> RSModel: + + def demand_pdf(p, d): + return (1 - p)**d * p + + d_values = np.arange(D_MAX) + ϕ_values = demand_pdf(p, d_values) + x_values = np.arange(K + 1) + + return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) +``` + +### The Bellman operator + +The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. + +For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute + +$$ + -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) + \;=\; + -\frac{1}{\gamma} + \left( + m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) + \right), +$$ + +where $m = \max_i (-\gamma z_i)$. 
+ +```{code-cell} ipython3 +@numba.jit(nopython=True) +def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + new_v = np.empty(K + 1) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + for a in range(K - x + 1): + # Compute -γ * z_i for each demand realization + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + # Log-sum-exp trick for numerical stability + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + new_v[x] = best + return new_v +``` + +```{code-cell} ipython3 +def T_rs(v, model): + """The risk-sensitive Bellman operator.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) +``` + + +### Computing the greedy policy + +The greedy policy records the maximizing action instead of the maximized value. 
+ +```{code-cell} ipython3 +@numba.jit(nopython=True) +def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + σ = np.empty(K + 1, dtype=np.int32) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + best_a = 0 + for a in range(K - x + 1): + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + best_a = a + σ[x] = best_a + return σ +``` + +```{code-cell} ipython3 +def get_greedy_rs(v, model): + """Get a v-greedy policy for the risk-sensitive model.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) +``` + +### Value function iteration + +```{code-cell} ipython3 +def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): + v = v_init.copy() + i, error = 0, tol + 1 + + while i < max_iter and error > tol: + new_v = T_rs(v, model) + error = np.max(np.abs(new_v - v)) + i += 1 + v = new_v + + print(f"Converged in {i} iterations with error {error:.2e}") + + σ = get_greedy_rs(v, model) + return v, σ +``` + +### Creating and solving an instance + +```{code-cell} ipython3 +model = create_rs_inventory_model() +x_values = model.x_values +n_x = len(x_values) +v_init = np.zeros(n_x) +``` + +```{code-cell} ipython3 +v_star, σ_star = solve_rs_inventory_model(v_init, model) +``` + +### Effect of risk sensitivity on the optimal policy + +We solve the model for several values of $\gamma$ and compare the resulting policies. + +As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
+
+```{code-cell} ipython3
+γ_values = [0.01, 1.0, 2.0]
+results = {}
+
+for γ in γ_values:
+    mod = create_rs_inventory_model(γ=γ)
+    v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)
+    results[γ] = (v, σ)
+```
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))
+
+for γ in γ_values:
+    v, σ = results[γ]
+    axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$")
+    axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$")
+
+axes[0].set_xlabel("inventory")
+axes[0].set_ylabel("value")
+axes[0].legend()
+axes[0].set_title("Value function")
+
+axes[1].set_xlabel("inventory")
+axes[1].set_ylabel("order quantity")
+axes[1].legend()
+axes[1].set_title("Policy")
+
+plt.tight_layout()
+plt.show()
+```
+
+### Simulating the optimal policy
+
+We simulate inventory dynamics under the optimal policy for each value of $\gamma$, using a common random seed so that the paths are comparable.
+
+```{code-cell} ipython3
+@numba.jit(nopython=True)
+def sim_inventories(ts_length, σ, p, X_init=0):
+    """Simulate inventory dynamics under policy σ."""
+    X = np.zeros(ts_length, dtype=np.int32)
+    X[0] = X_init
+    for t in range(ts_length - 1):
+        d = np.random.geometric(p) - 1
+        X[t+1] = max(X[t] - d, 0) + σ[X[t]]
+    return X
+```
+
+```{code-cell} ipython3
+fig, axes = plt.subplots(len(γ_values), 1,
+                         figsize=(8, 2.0 * len(γ_values)),
+                         sharex=True)
+
+ts_length = 200
+sim_seed = 5678
+K = len(x_values) - 1
+
+for i, γ in enumerate(γ_values):
+    v, σ = results[γ]
+    np.random.seed(sim_seed)
+    X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)
+    axes[i].plot(X, alpha=0.7)
+    axes[i].set_ylabel("inventory")
+    axes[i].set_title(f"$\\gamma = {γ}$")
+    axes[i].set_ylim(0, K + 2)
+
+axes[-1].set_xlabel(r"$t$")
+plt.tight_layout()
+plt.show()
+```
+
+## Interpreting the Outcomes
+
+The plots above show that a more risk-sensitive firm (larger $\gamma$) orders
+less inventory and maintains lower stock levels.
+ +At first glance this may seem surprising: wouldn't holding more inventory +reduce variance by ensuring the firm can always meet demand? + +The key is to identify where the randomness in profits actually comes from. + +Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa +\mathbf{1}\{a > 0\}$. + +The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it +is chosen before the demand shock is realized. + +So higher ordering shifts the level of profits down but does not affect their +variance. + +The variance comes from **revenue**: $\min(x, D)$. + +When inventory $x$ is high, $\min(x, D) \approx D$ for most demand +realizations — revenue inherits the full variance of demand. + +When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — +revenue is nearly deterministic, capped at the inventory level. + +A risk-sensitive agent therefore prefers lower inventory because it **caps the +randomness of revenue**. + +The agent accepts lower expected sales in exchange for more predictable profits. + +There is also a continuation value channel: next-period inventory $\max(x - D, +0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more +closely, propagating that variance forward through $v$. + + +## Q-Learning + +We now ask whether the optimal policy can be learned without knowledge of the +model, as we did in the risk-neutral case in {ref}`inventory_q`. + +### The Q-factor + +The first step is to define the Q-factor in a way that is compatible with the +risk-sensitive Bellman equation. + +We define + +$$ + q(x, a) := \mathbb E + \left[ + \exp\!\left( + -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) + \right) + \right]. +$$ + +In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +\exp(-\gamma t)$ inside the expectation, evaluated at the return from taking +action $a$ in state $x$ and following the optimal policy thereafter. 
+ +### Deriving the Q-factor Bellman equation + +Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. + +**Step 1.** Express $v^*$ in terms of $q$. + +The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} +\phi^{-1}(q(x, a))$. + +Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +over $a$ of $q(x, a)$: + +$$ + v^*(x) + = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) + = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). +$$ + +Equivalently, + +$$ + \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). +$$ + +**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. + +Expanding the exponential in the definition of $q$, + +$$ + q(x, a) + = \mathbb E + \left[ + \exp(-\gamma \, \pi(x, a, D)) + \;\cdot\; + \exp\!\left(-\gamma \beta \, v^*(x')\right) + \right] +$$ + +where $x' = h(x, a, D)$. + +From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, +so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', +a')\right]^\beta$. + +Substituting, + +$$ + q(x, a) + = \mathbb E + \left[ + \exp(-\gamma \, \pi(x, a, D)) + \;\cdot\; + \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta + \right]. +$$ + +This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. + +### The Q-learning update rule + +As in the risk-neutral case, we approximate the fixed point using stochastic +approximation. + +At each step, the agent is in state $x$, takes action $a$, observes profit +$R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, +and updates + +$$ + q_{t+1}(x, a) + = (1 - \alpha_t) \, q_t(x, a) + + \alpha_t + \left[ + \exp(-\gamma \, R_{t+1}) + \;\cdot\; + \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta + \right]. 
+$$ + +The term in brackets is a single-sample estimate of the right-hand side of the +Q-factor Bellman equation. + +The update blends the current estimate with this fresh sample, just as in +standard Q-learning. + +Notice several differences from the risk-neutral case: + +- The Q-values are **positive** (expectations of exponentials) rather than signed. +- The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** + rather than maximize, because $\phi^{-1}$ is decreasing. +- The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than + additively. +- The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather + than a scaled sum $\beta \cdot \max_{a'} q_t$. + +As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and +$X_{t+1}$ — no model knowledge is required. + +### Implementation plan + +Our implementation follows the same structure as the risk-neutral Q-learning in +{ref}`inventory_q`, with the modifications above: + +1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and + visit counts $n$ to zeros. +2. **At each step:** + - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state + $X_{t+1}$. + - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a + scalar for the update target, and the $\arg\min$ action is used by the + $\varepsilon$-greedy behavior policy). + - Update $q_t(x, a)$ using the rule above, with learning rate + $\alpha_t = 1 / n_t(x, a)^{0.51}$. + - Choose the next action via $\varepsilon$-greedy: with probability + $\varepsilon$ pick a random feasible action, otherwise pick the + $\arg\min$ action. + - Decay $\varepsilon$. +3. **Extract the greedy policy** from the final Q-table via + $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. +4. **Compare** the learned policy against the VFI solution. + +### Implementation + +We first define a helper to extract the greedy policy from the Q-table. 
+ +Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. + +```{code-cell} ipython3 +@numba.jit(nopython=True) +def greedy_policy_from_q_rs(q, K): + """Extract greedy policy from risk-sensitive Q-table (argmin).""" + σ = np.empty(K + 1, dtype=np.int32) + for x in range(K + 1): + best_val = np.inf + best_a = 0 + for a in range(K - x + 1): + if q[x, a] < best_val: + best_val = q[x, a] + best_a = a + σ[x] = best_a + return σ +``` + +The Q-learning loop mirrors the risk-neutral version, with the key changes: +Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) +\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. + +```{code-cell} ipython3 +@numba.jit(nopython=True) +def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps): + q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 + n = np.zeros((K + 1, K + 1)) # visit counts for learning rate + ε = ε_init + + n_snaps = len(snapshot_steps) + snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) + snap_idx = 0 + + # Initialize state and action + x = X_init + a = np.random.randint(0, K - x + 1) + + for t in range(n_steps): + # Record policy snapshot if needed + if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: + snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) + snap_idx += 1 + + # === Draw D_{t+1} and observe outcome === + d = np.random.geometric(p) - 1 + reward = min(x, d) - c * a - κ * (a > 0) + x_next = max(x - d, 0) + a + + # === Min over next state (scalar value for update target) === + # Also record the argmin action for use by the behavior policy. 
+ best_next = np.inf + a_next = 0 + for aa in range(K - x_next + 1): + if q[x_next, aa] < best_next: + best_next = q[x_next, aa] + a_next = aa + + # === Risk-sensitive Q-learning update === + target = np.exp(-γ * reward) * best_next ** β + n[x, a] += 1 + α = 1.0 / n[x, a] ** 0.51 + q[x, a] = (1 - α) * q[x, a] + α * target + + # === Behavior policy: ε-greedy (uses a_next, the argmin action) === + x = x_next + if np.random.random() < ε: + a = np.random.randint(0, K - x + 1) + else: + a = a_next + ε = max(ε_min, ε * ε_decay) + + return q, snapshots +``` + +The wrapper function unpacks the model and provides default hyperparameters. + +```{code-cell} ipython3 +def q_learning_rs(model, n_steps=20_000_000, X_init=0, + ε_init=1.0, ε_min=0.01, ε_decay=0.999999, + snapshot_steps=None): + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + if snapshot_steps is None: + snapshot_steps = np.array([], dtype=np.int64) + return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps) +``` + +### Running Q-learning + +We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. + +```{code-cell} ipython3 +np.random.seed(1234) +snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) +q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) +``` + +### Comparing with the exact solution + +We extract the value function and policy from the final Q-table. + +Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the +value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the +policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. 
+ +```{code-cell} ipython3 +K = len(x_values) - 1 +γ_base = model.γ +# restrict to feasible actions a ∈ {0, ..., K-x} +v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) + for x in range(K + 1)]) +σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) + for x in range(K + 1)]) +``` + +```{code-cell} ipython3 +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +axes[0].plot(x_values, v_star, label="VFI") +axes[0].plot(x_values, v_q, '--', label="Q-learning") +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].plot(x_values, σ_star, label="VFI") +axes[1].plot(x_values, σ_q, '--', label="Q-learning") +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() +``` + +### Visualizing learning over time + +The panels below show how the agent's policy evolves during training. + +Each panel simulates an inventory path using the greedy policy extracted from +the Q-table at a given training step, with the same demand sequence throughout. + +The top panel shows the optimal policy from VFI for reference. 
+ +```{code-cell} ipython3 +ts_length = 200 +n_snaps = len(snap_steps) +fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), + sharex=True) + +X_init = K // 2 +sim_seed = 5678 + +# Optimal policy +np.random.seed(sim_seed) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +axes[0].plot(X_opt, alpha=0.7) +axes[0].set_ylabel("inventory") +axes[0].set_title("Optimal (VFI)") +axes[0].set_ylim(0, K + 2) + +# Q-learning snapshots +for i in range(n_snaps): + σ_snap = snapshots[i] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ_snap, model.p, X_init) + axes[i + 1].plot(X, alpha=0.7) + axes[i + 1].set_ylabel("inventory") + axes[i + 1].set_title(f"Step {snap_steps[i]:,}") + axes[i + 1].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() +``` + +After 10,000 steps the agent has barely explored and its policy is erratic. + +By 1,000,000 steps the learned policy begins to resemble the optimal one, and +by step 20 million the inventory dynamics are nearly indistinguishable from the +VFI solution. + +Note that the converged policy maintains lower inventory levels than in the +risk-neutral case (compare with {ref}`inventory_q`), consistent with the +mechanism discussed above: a risk-sensitive agent caps its exposure to demand +variance by holding less stock. + +## Conclusion + +We extended the inventory management problem from {ref}`inventory_q` to +incorporate risk sensitivity via the certainty equivalent operator +$\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. + +Value function iteration confirms that risk-sensitive firms order less +aggressively, preferring predictable profits over higher but more volatile +returns. + +We then showed that Q-learning can be adapted to the risk-sensitive setting by +working with the transformed Q-factor $q(x,a) = +\mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. 
+ +The resulting update rule replaces addition with multiplication and max with +min, but retains the key property of model-free learning: the agent needs only +to observe states, actions, and profits. diff --git a/lectures/rs_inventory_q.py b/lectures/rs_inventory_q.py new file mode 100644 index 000000000..82a1ba74f --- /dev/null +++ b/lectures/rs_inventory_q.py @@ -0,0 +1,767 @@ +# --- +# jupyter: +# jupytext: +# default_lexer: ipython3 +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# (rs_inventory_q)= +# ```{raw} jupyter +#
+# +# QuantEcon +# +#
+# ``` +# +# # Risk-Sensitive Inventory Management via Q-Learning +# +# ```{contents} Contents +# :depth: 2 +# ``` +# +# ## Introduction +# +# In {ref}`inventory_q`, we looked at an inventory management +# problem and solved it with both value function iteration and Q-learning. +# +# In this lecture, we consider a risk-sensitive variation. +# +# Injection of risk-sensitivity acknowledges the fact +# that, in incomplete markets with financial and informational frictions, firms +# typically take risk into account in their decision making. +# +# In other words, the actions of firms are not, in general, risk neutral. +# +# One natural way to handle this is to use a risk-sensitive version of the Bellman +# equation. +# +# We show how the model can be solved using value function iteration. +# +# We then investigate how risk sensitivity affects the optimal policy. + +# %% +import numpy as np +import numba +import matplotlib.pyplot as plt +from typing import NamedTuple + + +# %% [markdown] +# ## The Model +# +# The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form +# +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} \mathbb E +# \left[ +# \pi(x, a, D) +# + \beta v(h(x, a, D)) +# \right]. +# $$ +# +# Here $D$ is a random variable with distribution $\phi$. +# +# (Primitives and definitions are the same as in {ref}`inventory_q`.) +# +# The risk-sensitive version of this Bellman equation has the form +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} +# \phi^{-1} +# \left\{ +# \mathbb E \phi +# \left[ +# \pi(x, a, D) +# + \beta v(h(x, a, D)) +# \right] +# \right\}, +# $$ +# +# where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. +# +# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes +# +# $$ +# v(x) +# = \max_{a \in \Gamma(x)} +# \left( +# -\frac{1}{\gamma} +# \right) +# \ln +# \left\{ +# \sum_d \phi(d) \exp +# \left[ +# -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) +# \right] +# \right\}. 
+# $$ +# +# Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. +# +# The parameter $\gamma$ controls the degree of risk sensitivity. +# +# As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. +# +# Larger $\gamma$ means more aversion to downside risk. +# +# The Bellman operator, greedy policy, and VFI algorithm all carry over from the +# risk-neutral case, with the expectation replaced by the certainty equivalent. +# +# +# +# ## Solving via Value Function Iteration +# +# ### Model specification +# +# We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. + +# %% +class RSModel(NamedTuple): + x_values: np.ndarray # Inventory values + d_values: np.ndarray # Demand values for summation + ϕ_values: np.ndarray # Demand probabilities + p: float # Demand parameter + c: float # Unit cost + κ: float # Fixed cost + β: float # Discount factor + γ: float # Risk-sensitivity parameter + + +# %% +def create_rs_inventory_model( + K: int = 20, # Max inventory + D_MAX: int = 21, # Demand upper bound for summation + p: float = 0.7, + c: float = 0.2, + κ: float = 0.8, + β: float = 0.98, + γ: float = 1.0 + ) -> RSModel: + + def demand_pdf(p, d): + return (1 - p)**d * p + + d_values = np.arange(D_MAX) + ϕ_values = demand_pdf(p, d_values) + x_values = np.arange(K + 1) + + return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) + + +# %% [markdown] +# ### The Bellman operator +# +# The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. 
+# +# For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute +# +# $$ +# -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) +# \;=\; +# -\frac{1}{\gamma} +# \left( +# m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) +# \right), +# $$ +# +# where $m = \max_i (-\gamma z_i)$. + +# %% +@numba.jit(nopython=True) +def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + new_v = np.empty(K + 1) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + for a in range(K - x + 1): + # Compute -γ * z_i for each demand realization + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + # Log-sum-exp trick for numerical stability + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + new_v[x] = best + return new_v + + +# %% +def T_rs(v, model): + """The risk-sensitive Bellman operator.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) + + +# %% [markdown] +# ### Computing the greedy policy +# +# The greedy policy records the maximizing action instead of the maximized value. 
+ +# %% +@numba.jit(nopython=True) +def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): + σ = np.empty(K + 1, dtype=np.int32) + n_d = len(d_values) + for x in range(K + 1): + best = -np.inf + best_a = 0 + for a in range(K - x + 1): + exponents = np.empty(n_d) + for i in range(n_d): + d = d_values[i] + x_next = max(x - d, 0) + a + revenue = min(x, d) + cost = c * a + κ * (a > 0) + z_i = revenue - cost + β * v[x_next] + exponents[i] = -γ * z_i + m = np.max(exponents) + weighted_sum = 0.0 + for i in range(n_d): + weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) + val = -(1.0 / γ) * (m + np.log(weighted_sum)) + if val > best: + best = val + best_a = a + σ[x] = best_a + return σ + + +# %% +def get_greedy_rs(v, model): + """Get a v-greedy policy for the risk-sensitive model.""" + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) + + +# %% [markdown] +# ### Value function iteration + +# %% +def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): + v = v_init.copy() + i, error = 0, tol + 1 + + while i < max_iter and error > tol: + new_v = T_rs(v, model) + error = np.max(np.abs(new_v - v)) + i += 1 + v = new_v + + print(f"Converged in {i} iterations with error {error:.2e}") + + σ = get_greedy_rs(v, model) + return v, σ + + +# %% [markdown] +# ### Creating and solving an instance + +# %% +model = create_rs_inventory_model() +x_values = model.x_values +n_x = len(x_values) +v_init = np.zeros(n_x) + +# %% +v_star, σ_star = solve_rs_inventory_model(v_init, model) + +# %% [markdown] +# ### Effect of risk sensitivity on the optimal policy +# +# We solve the model for several values of $\gamma$ and compare the resulting policies. +# +# As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
+ +# %% +γ_values = [0.01, 1.0, 2.0] +results = {} + +for γ in γ_values: + mod = create_rs_inventory_model(γ=γ) + v, σ = solve_rs_inventory_model(np.zeros(n_x), mod) + results[γ] = (v, σ) + +# %% +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +for γ in γ_values: + v, σ = results[γ] + axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$") + axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$") + +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() + + +# %% [markdown] +# ### Simulating the optimal policy +# +# We simulate inventory dynamics under the optimal policy for the baseline $\gamma$. + +# %% +@numba.jit(nopython=True) +def sim_inventories(ts_length, σ, p, X_init=0): + """Simulate inventory dynamics under policy σ.""" + X = np.zeros(ts_length, dtype=np.int32) + X[0] = X_init + for t in range(ts_length - 1): + d = np.random.geometric(p) - 1 + X[t+1] = max(X[t] - d, 0) + σ[X[t]] + return X + + +# %% +fig, axes = plt.subplots(len(γ_values), 1, + figsize=(8, 2.0 * len(γ_values)), + sharex=True) + +ts_length = 200 +sim_seed = 5678 +K = len(x_values) - 1 + +for i, γ in enumerate(γ_values): + v, σ = results[γ] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) + axes[i].plot(X, alpha=0.7) + axes[i].set_ylabel("inventory") + axes[i].set_title(f"$\\gamma = {γ}$") + axes[i].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() + + +# %% [markdown] +# ## Interpreting the Outcomes +# +# The plots above show that a more risk-sensitive firm (larger $\gamma$) orders +# less inventory and maintains lower stock levels. +# +# At first glance this may seem surprising: wouldn't holding more inventory +# reduce variance by ensuring the firm can always meet demand? 
+# +# The key is to identify where the randomness in profits actually comes from. +# +# Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa +# \mathbf{1}\{a > 0\}$. +# +# The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it +# is chosen before the demand shock is realized. +# +# So higher ordering shifts the level of profits down but does not affect their +# variance. +# +# The variance comes from **revenue**: $\min(x, D)$. +# +# When inventory $x$ is high, $\min(x, D) \approx D$ for most demand +# realizations — revenue inherits the full variance of demand. +# +# When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — +# revenue is nearly deterministic, capped at the inventory level. +# +# A risk-sensitive agent therefore prefers lower inventory because it **caps the +# randomness of revenue**. +# +# The agent accepts lower expected sales in exchange for more predictable profits. +# +# There is also a continuation value channel: next-period inventory $\max(x - D, +# 0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more +# closely, propagating that variance forward through $v$. +# +# +# ## Q-Learning +# +# We now ask whether the optimal policy can be learned without knowledge of the +# model, as we did in the risk-neutral case in {ref}`inventory_q`. +# +# ### The Q-factor +# +# The first step is to define the Q-factor in a way that is compatible with the +# risk-sensitive Bellman equation. +# +# We define +# +# $$ +# q(x, a) := \mathbb E +# \left[ +# \exp\!\left( +# -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) +# \right) +# \right]. +# $$ +# +# In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +# \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking +# action $a$ in state $x$ and following the optimal policy thereafter. 
+# +# ### Deriving the Q-factor Bellman equation +# +# Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. +# +# **Step 1.** Express $v^*$ in terms of $q$. +# +# The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} +# \phi^{-1}(q(x, a))$. +# +# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +# the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +# over $a$ of $q(x, a)$: +# +# $$ +# v^*(x) +# = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) +# = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). +# $$ +# +# Equivalently, +# +# $$ +# \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). +# $$ +# +# **Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. +# +# Expanding the exponential in the definition of $q$, +# +# $$ +# q(x, a) +# = \mathbb E +# \left[ +# \exp(-\gamma \, \pi(x, a, D)) +# \;\cdot\; +# \exp\!\left(-\gamma \beta \, v^*(x')\right) +# \right] +# $$ +# +# where $x' = h(x, a, D)$. +# +# From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, +# so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', +# a')\right]^\beta$. +# +# Substituting, +# +# $$ +# q(x, a) +# = \mathbb E +# \left[ +# \exp(-\gamma \, \pi(x, a, D)) +# \;\cdot\; +# \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta +# \right]. +# $$ +# +# This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. +# +# ### The Q-learning update rule +# +# As in the risk-neutral case, we approximate the fixed point using stochastic +# approximation. 
+# +# At each step, the agent is in state $x$, takes action $a$, observes profit +# $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, +# and updates +# +# $$ +# q_{t+1}(x, a) +# = (1 - \alpha_t) \, q_t(x, a) +# + \alpha_t +# \left[ +# \exp(-\gamma \, R_{t+1}) +# \;\cdot\; +# \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta +# \right]. +# $$ +# +# The term in brackets is a single-sample estimate of the right-hand side of the +# Q-factor Bellman equation. +# +# The update blends the current estimate with this fresh sample, just as in +# standard Q-learning. +# +# Notice several differences from the risk-neutral case: +# +# - The Q-values are **positive** (expectations of exponentials) rather than signed. +# - The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** +# rather than maximize, because $\phi^{-1}$ is decreasing. +# - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than +# additively. +# - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather +# than a scaled sum $\beta \cdot \max_{a'} q_t$. +# +# As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and +# $X_{t+1}$ — no model knowledge is required. +# +# ### Implementation plan +# +# Our implementation follows the same structure as the risk-neutral Q-learning in +# {ref}`inventory_q`, with the modifications above: +# +# 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and +# visit counts $n$ to zeros. +# 2. **At each step:** +# - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state +# $X_{t+1}$. +# - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a +# scalar for the update target, and the $\arg\min$ action is used by the +# $\varepsilon$-greedy behavior policy). +# - Update $q_t(x, a)$ using the rule above, with learning rate +# $\alpha_t = 1 / n_t(x, a)^{0.51}$. 
+# - Choose the next action via $\varepsilon$-greedy: with probability +# $\varepsilon$ pick a random feasible action, otherwise pick the +# $\arg\min$ action. +# - Decay $\varepsilon$. +# 3. **Extract the greedy policy** from the final Q-table via +# $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. +# 4. **Compare** the learned policy against the VFI solution. +# +# ### Implementation +# +# We first define a helper to extract the greedy policy from the Q-table. +# +# Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. + +# %% +@numba.jit(nopython=True) +def greedy_policy_from_q_rs(q, K): + """Extract greedy policy from risk-sensitive Q-table (argmin).""" + σ = np.empty(K + 1, dtype=np.int32) + for x in range(K + 1): + best_val = np.inf + best_a = 0 + for a in range(K - x + 1): + if q[x, a] < best_val: + best_val = q[x, a] + best_a = a + σ[x] = best_a + return σ + + +# %% [markdown] +# The Q-learning loop mirrors the risk-neutral version, with the key changes: +# Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) +# \cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. 
+ +# %% +@numba.jit(nopython=True) +def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps): + q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 + n = np.zeros((K + 1, K + 1)) # visit counts for learning rate + ε = ε_init + + n_snaps = len(snapshot_steps) + snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) + snap_idx = 0 + + # Initialize state and action + x = X_init + a = np.random.randint(0, K - x + 1) + + for t in range(n_steps): + # Record policy snapshot if needed + if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: + snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) + snap_idx += 1 + + # === Draw D_{t+1} and observe outcome === + d = np.random.geometric(p) - 1 + reward = min(x, d) - c * a - κ * (a > 0) + x_next = max(x - d, 0) + a + + # === Min over next state (scalar value for update target) === + # Also record the argmin action for use by the behavior policy. + best_next = np.inf + a_next = 0 + for aa in range(K - x_next + 1): + if q[x_next, aa] < best_next: + best_next = q[x_next, aa] + a_next = aa + + # === Risk-sensitive Q-learning update === + target = np.exp(-γ * reward) * best_next ** β + n[x, a] += 1 + α = 1.0 / n[x, a] ** 0.51 + q[x, a] = (1 - α) * q[x, a] + α * target + + # === Behavior policy: ε-greedy (uses a_next, the argmin action) === + x = x_next + if np.random.random() < ε: + a = np.random.randint(0, K - x + 1) + else: + a = a_next + ε = max(ε_min, ε * ε_decay) + + return q, snapshots + + +# %% [markdown] +# The wrapper function unpacks the model and provides default hyperparameters. 
+ +# %% +def q_learning_rs(model, n_steps=20_000_000, X_init=0, + ε_init=1.0, ε_min=0.01, ε_decay=0.999999, + snapshot_steps=None): + x_values, d_values, ϕ_values, p, c, κ, β, γ = model + K = len(x_values) - 1 + if snapshot_steps is None: + snapshot_steps = np.array([], dtype=np.int64) + return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, + ε_init, ε_min, ε_decay, snapshot_steps) + + +# %% [markdown] +# ### Running Q-learning +# +# We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. + +# %% +np.random.seed(1234) +snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) +q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) + +# %% [markdown] +# ### Comparing with the exact solution +# +# We extract the value function and policy from the final Q-table. +# +# Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the +# value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the +# policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. + +# %% +K = len(x_values) - 1 +γ_base = model.γ +# restrict to feasible actions a ∈ {0, ..., K-x} +v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) + for x in range(K + 1)]) +σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) + for x in range(K + 1)]) + +# %% +fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) + +axes[0].plot(x_values, v_star, label="VFI") +axes[0].plot(x_values, v_q, '--', label="Q-learning") +axes[0].set_xlabel("inventory") +axes[0].set_ylabel("value") +axes[0].legend() +axes[0].set_title("Value function") + +axes[1].plot(x_values, σ_star, label="VFI") +axes[1].plot(x_values, σ_q, '--', label="Q-learning") +axes[1].set_xlabel("inventory") +axes[1].set_ylabel("order quantity") +axes[1].legend() +axes[1].set_title("Policy") + +plt.tight_layout() +plt.show() + +# %% [markdown] +# ### Visualizing learning over time +# +# The panels below show how the agent's policy evolves during training. 
+# +# Each panel simulates an inventory path using the greedy policy extracted from +# the Q-table at a given training step, with the same demand sequence throughout. +# +# The top panel shows the optimal policy from VFI for reference. + +# %% +ts_length = 200 +n_snaps = len(snap_steps) +fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), + sharex=True) + +X_init = K // 2 +sim_seed = 5678 + +# Optimal policy +np.random.seed(sim_seed) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +axes[0].plot(X_opt, alpha=0.7) +axes[0].set_ylabel("inventory") +axes[0].set_title("Optimal (VFI)") +axes[0].set_ylim(0, K + 2) + +# Q-learning snapshots +for i in range(n_snaps): + σ_snap = snapshots[i] + np.random.seed(sim_seed) + X = sim_inventories(ts_length, σ_snap, model.p, X_init) + axes[i + 1].plot(X, alpha=0.7) + axes[i + 1].set_ylabel("inventory") + axes[i + 1].set_title(f"Step {snap_steps[i]:,}") + axes[i + 1].set_ylim(0, K + 2) + +axes[-1].set_xlabel(r"$t$") +plt.tight_layout() +plt.show() + +# %% [markdown] +# After 10,000 steps the agent has barely explored and its policy is erratic. +# +# By 1,000,000 steps the learned policy begins to resemble the optimal one, and +# by step 20 million the inventory dynamics are nearly indistinguishable from the +# VFI solution. +# +# Note that the converged policy maintains lower inventory levels than in the +# risk-neutral case (compare with {ref}`inventory_q`), consistent with the +# mechanism discussed above: a risk-sensitive agent caps its exposure to demand +# variance by holding less stock. +# +# ## Conclusion +# +# We extended the inventory management problem from {ref}`inventory_q` to +# incorporate risk sensitivity via the certainty equivalent operator +# $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. +# +# Value function iteration confirms that risk-sensitive firms order less +# aggressively, preferring predictable profits over higher but more volatile +# returns. 
+# +# We then showed that Q-learning can be adapted to the risk-sensitive setting by +# working with the transformed Q-factor $q(x,a) = +# \mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. +# +# The resulting update rule replaces addition with multiplication and max with +# min, but retains the key property of model-free learning: the agent needs only +# to observe states, actions, and profits. From 8aee45af1523c5aaaf646dad6adf497971598c33 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 15:31:15 +1100 Subject: [PATCH 2/8] Add rs_inventory_q to table of contents Co-Authored-By: Claude Opus 4.6 --- lectures/_toc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/lectures/_toc.yml b/lectures/_toc.yml index cd51e010b..f43d7f3dc 100644 --- a/lectures/_toc.yml +++ b/lectures/_toc.yml @@ -77,6 +77,7 @@ parts: numbered: true chapters: - file: inventory_q + - file: rs_inventory_q - file: mccall_q - caption: Introduction to Optimal Savings numbered: true From 9ce92390a01bc1cb719f6edc987335b036395642 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 15:44:05 +1100 Subject: [PATCH 3/8] Remove generated ipynb and py files from tracking The build system generates notebooks from the .md source. Having .ipynb and .py files in the repo causes duplicate document warnings and cross-reference failures. Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.ipynb | 977 ---------------------------------- lectures/rs_inventory_q.py | 767 -------------------------- 2 files changed, 1744 deletions(-) delete mode 100644 lectures/rs_inventory_q.ipynb delete mode 100644 lectures/rs_inventory_q.py diff --git a/lectures/rs_inventory_q.ipynb b/lectures/rs_inventory_q.ipynb deleted file mode 100644 index 99dc4f676..000000000 --- a/lectures/rs_inventory_q.ipynb +++ /dev/null @@ -1,977 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "17be60e4", - "metadata": {}, - "source": [ - "(rs_inventory_q)=\n", - "```{raw} jupyter\n", - "
\n", - " \n", - " \"QuantEcon\"\n", - " \n", - "
\n", - "```\n", - "\n", - "# Risk-Sensitive Inventory Management via Q-Learning\n", - "\n", - "```{contents} Contents\n", - ":depth: 2\n", - "```\n", - "\n", - "## Introduction\n", - "\n", - "In {ref}`inventory_q`, we looked at an inventory management\n", - "problem and solved it with both value function iteration and Q-learning.\n", - "\n", - "In this lecture, we consider a risk-sensitive variation.\n", - "\n", - "Injection of risk-sensitivity acknowledges the fact\n", - "that, in incomplete markets with financial and informational frictions, firms\n", - "typically take risk into account in their decision making.\n", - "\n", - "In other words, the actions of firms are not, in general, risk neutral.\n", - "\n", - "One natural way to handle this is to use a risk-sensitive version of the Bellman\n", - "equation.\n", - "\n", - "We show how the model can be solved using value function iteration.\n", - "\n", - "We then investigate how risk sensitivity affects the optimal policy." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95f2dc54", - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import numba\n", - "import matplotlib.pyplot as plt\n", - "from typing import NamedTuple" - ] - }, - { - "cell_type": "markdown", - "id": "acc004dc", - "metadata": {}, - "source": [ - "## The Model\n", - "\n", - "The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form\n", - "\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)} \\mathbb E\n", - " \\left[\n", - " \\pi(x, a, D)\n", - " + \\beta v(h(x, a, D))\n", - " \\right].\n", - "$$\n", - "\n", - "Here $D$ is a random variable with distribution $\\phi$.\n", - "\n", - "(Primitives and definitions are the same as in {ref}`inventory_q`.)\n", - "\n", - "The risk-sensitive version of this Bellman equation has the form\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)}\n", - " \\phi^{-1}\n", - " \\left\\{\n", - " \\mathbb E 
\\phi\n", - " \\left[\n", - " \\pi(x, a, D)\n", - " + \\beta v(h(x, a, D))\n", - " \\right]\n", - " \\right\\},\n", - "$$\n", - "\n", - "where $\\phi(t) = \\exp(-\\gamma t)$ for fixed $\\gamma > 0$.\n", - "\n", - "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$, the Bellman equation becomes\n", - "\n", - "$$\n", - " v(x)\n", - " = \\max_{a \\in \\Gamma(x)}\n", - " \\left(\n", - " -\\frac{1}{\\gamma}\n", - " \\right)\n", - " \\ln\n", - " \\left\\{\n", - " \\sum_d \\phi(d) \\exp\n", - " \\left[\n", - " -\\gamma \\left( \\pi(x, a, d) + \\beta \\, v(h(x, a, d)) \\right)\n", - " \\right]\n", - " \\right\\}.\n", - "$$\n", - "\n", - "Here $\\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`.\n", - "\n", - "The parameter $\\gamma$ controls the degree of risk sensitivity.\n", - "\n", - "As $\\gamma \\to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case.\n", - "\n", - "Larger $\\gamma$ means more aversion to downside risk.\n", - "\n", - "The Bellman operator, greedy policy, and VFI algorithm all carry over from the\n", - "risk-neutral case, with the expectation replaced by the certainty equivalent.\n", - "\n", - "\n", - "\n", - "## Solving via Value Function Iteration\n", - "\n", - "### Model specification\n", - "\n", - "We reuse the same model primitives as in {ref}`inventory_q`, adding $\\gamma$ as a parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7909494", - "metadata": {}, - "outputs": [], - "source": [ - "class RSModel(NamedTuple):\n", - " x_values: np.ndarray # Inventory values\n", - " d_values: np.ndarray # Demand values for summation\n", - " ϕ_values: np.ndarray # Demand probabilities\n", - " p: float # Demand parameter\n", - " c: float # Unit cost\n", - " κ: float # Fixed cost\n", - " β: float # Discount factor\n", - " γ: float # Risk-sensitivity parameter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c8cf486f", - "metadata": {}, - "outputs": [], - "source": [ - "def create_rs_inventory_model(\n", - " K: int = 20, # Max inventory\n", - " D_MAX: int = 21, # Demand upper bound for summation\n", - " p: float = 0.7,\n", - " c: float = 0.2,\n", - " κ: float = 0.8,\n", - " β: float = 0.98,\n", - " γ: float = 1.0\n", - " ) -> RSModel:\n", - "\n", - " def demand_pdf(p, d):\n", - " return (1 - p)**d * p\n", - "\n", - " d_values = np.arange(D_MAX)\n", - " ϕ_values = demand_pdf(p, d_values)\n", - " x_values = np.arange(K + 1)\n", - "\n", - " return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ)" - ] - }, - { - "cell_type": "markdown", - "id": "060e0b9c", - "metadata": {}, - "source": [ - "### The Bellman operator\n", - "\n", - "The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent.\n", - "\n", - "For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \\pi(x, a, d_i) + \\beta \\, v(h(x, a, d_i))$, we compute\n", - "\n", - "$$\n", - " -\\frac{1}{\\gamma} \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i)\n", - " \\;=\\;\n", - " -\\frac{1}{\\gamma}\n", - " \\left(\n", - " m + \\ln \\sum_i \\phi(d_i) \\exp(-\\gamma z_i - m)\n", - " \\right),\n", - "$$\n", - "\n", - "where $m = \\max_i (-\\gamma z_i)$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ecff396", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", - " new_v = np.empty(K + 1)\n", - " n_d = len(d_values)\n", - " for x in range(K + 1):\n", - " best = -np.inf\n", - " for a in range(K - x + 1):\n", - " # Compute -γ * z_i for each demand realization\n", - " exponents = np.empty(n_d)\n", - " for i in range(n_d):\n", - " d = d_values[i]\n", - " x_next = max(x - d, 0) + a\n", - " revenue = min(x, d)\n", - " cost = c * a + κ * (a > 0)\n", - " z_i = revenue - cost + β * v[x_next]\n", - " exponents[i] = -γ * z_i\n", - " # Log-sum-exp trick for numerical stability\n", - " m = np.max(exponents)\n", - " weighted_sum = 0.0\n", - " for i in range(n_d):\n", - " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", - " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", - " if val > best:\n", - " best = val\n", - " new_v[x] = best\n", - " return new_v" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fdd1618", - "metadata": {}, - "outputs": [], - "source": [ - "def T_rs(v, model):\n", - " \"\"\"The risk-sensitive Bellman operator.\"\"\"\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" - ] - }, - { - "cell_type": "markdown", - "id": "03fbea31", - "metadata": {}, - "source": [ - "### Computing the greedy policy\n", - "\n", - "The greedy policy records the maximizing action instead of the maximized value." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7eeb7faf", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K):\n", - " σ = np.empty(K + 1, dtype=np.int32)\n", - " n_d = len(d_values)\n", - " for x in range(K + 1):\n", - " best = -np.inf\n", - " best_a = 0\n", - " for a in range(K - x + 1):\n", - " exponents = np.empty(n_d)\n", - " for i in range(n_d):\n", - " d = d_values[i]\n", - " x_next = max(x - d, 0) + a\n", - " revenue = min(x, d)\n", - " cost = c * a + κ * (a > 0)\n", - " z_i = revenue - cost + β * v[x_next]\n", - " exponents[i] = -γ * z_i\n", - " m = np.max(exponents)\n", - " weighted_sum = 0.0\n", - " for i in range(n_d):\n", - " weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m)\n", - " val = -(1.0 / γ) * (m + np.log(weighted_sum))\n", - " if val > best:\n", - " best = val\n", - " best_a = a\n", - " σ[x] = best_a\n", - " return σ" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5473245", - "metadata": {}, - "outputs": [], - "source": [ - "def get_greedy_rs(v, model):\n", - " \"\"\"Get a v-greedy policy for the risk-sensitive model.\"\"\"\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K)" - ] - }, - { - "cell_type": "markdown", - "id": "a6a3b8be", - "metadata": {}, - "source": [ - "### Value function iteration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ea93ea8", - "metadata": {}, - "outputs": [], - "source": [ - "def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6):\n", - " v = v_init.copy()\n", - " i, error = 0, tol + 1\n", - "\n", - " while i < max_iter and error > tol:\n", - " new_v = T_rs(v, model)\n", - " error = np.max(np.abs(new_v - v))\n", - " i += 1\n", - " v = new_v\n", - "\n", - " print(f\"Converged in {i} iterations with error 
{error:.2e}\")\n", - "\n", - " σ = get_greedy_rs(v, model)\n", - " return v, σ" - ] - }, - { - "cell_type": "markdown", - "id": "962b1a57", - "metadata": {}, - "source": [ - "### Creating and solving an instance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f6fe6712", - "metadata": {}, - "outputs": [], - "source": [ - "model = create_rs_inventory_model()\n", - "x_values = model.x_values\n", - "n_x = len(x_values)\n", - "v_init = np.zeros(n_x)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3dc85cec", - "metadata": {}, - "outputs": [], - "source": [ - "v_star, σ_star = solve_rs_inventory_model(v_init, model)" - ] - }, - { - "cell_type": "markdown", - "id": "24295d26", - "metadata": {}, - "source": [ - "### Effect of risk sensitivity on the optimal policy\n", - "\n", - "We solve the model for several values of $\\gamma$ and compare the resulting policies.\n", - "\n", - "As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c599f91f", - "metadata": {}, - "outputs": [], - "source": [ - "γ_values = [0.01, 1.0, 2.0]\n", - "results = {}\n", - "\n", - "for γ in γ_values:\n", - " mod = create_rs_inventory_model(γ=γ)\n", - " v, σ = solve_rs_inventory_model(np.zeros(n_x), mod)\n", - " results[γ] = (v, σ)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bfb69b17", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", - "\n", - "for γ in γ_values:\n", - " v, σ = results[γ]\n", - " axes[0].plot(x_values, v, label=f\"$\\\\gamma = {γ}$\")\n", - " axes[1].plot(x_values, σ, label=f\"$\\\\gamma = {γ}$\")\n", - "\n", - "axes[0].set_xlabel(\"inventory\")\n", - "axes[0].set_ylabel(\"value\")\n", - "axes[0].legend()\n", - "axes[0].set_title(\"Value function\")\n", - "\n", - "axes[1].set_xlabel(\"inventory\")\n", - "axes[1].set_ylabel(\"order quantity\")\n", - "axes[1].legend()\n", - "axes[1].set_title(\"Policy\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "55ed320c", - "metadata": {}, - "source": [ - "### Simulating the optimal policy\n", - "\n", - "We simulate inventory dynamics under the optimal policy for the baseline $\\gamma$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7bfdd38c", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def sim_inventories(ts_length, σ, p, X_init=0):\n", - " \"\"\"Simulate inventory dynamics under policy σ.\"\"\"\n", - " X = np.zeros(ts_length, dtype=np.int32)\n", - " X[0] = X_init\n", - " for t in range(ts_length - 1):\n", - " d = np.random.geometric(p) - 1\n", - " X[t+1] = max(X[t] - d, 0) + σ[X[t]]\n", - " return X" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "993bf7d8", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(len(γ_values), 1,\n", - " figsize=(8, 2.0 * len(γ_values)),\n", - " sharex=True)\n", - "\n", - "ts_length = 200\n", - "sim_seed = 5678\n", - "K = len(x_values) - 1\n", - "\n", - "for i, γ in enumerate(γ_values):\n", - " v, σ = results[γ]\n", - " np.random.seed(sim_seed)\n", - " X = sim_inventories(ts_length, σ, model.p, X_init=K // 2)\n", - " axes[i].plot(X, alpha=0.7)\n", - " axes[i].set_ylabel(\"inventory\")\n", - " axes[i].set_title(f\"$\\\\gamma = {γ}$\")\n", - " axes[i].set_ylim(0, K + 2)\n", - "\n", - "axes[-1].set_xlabel(r\"$t$\")\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "f33d501a", - "metadata": {}, - "source": [ - "## Interpreting the Outcomes\n", - "\n", - "The plots above show that a more risk-sensitive firm (larger $\\gamma$) orders\n", - "less inventory and maintains lower stock levels.\n", - "\n", - "At first glance this may seem surprising: wouldn't holding more inventory\n", - "reduce variance by ensuring the firm can always meet demand?\n", - "\n", - "The key is to identify where the randomness in profits actually comes from.\n", - "\n", - "Recall that per-period profit is $\\pi(x, a, d) = \\min(x, d) - ca - \\kappa\n", - "\\mathbf{1}\\{a > 0\\}$.\n", - "\n", - "The ordering cost $ca + \\kappa \\mathbf{1}\\{a > 0\\}$ is **deterministic** — it\n", - "is chosen 
before the demand shock is realized.\n", - "\n", - "So higher ordering shifts the level of profits down but does not affect their\n", - "variance.\n", - "\n", - "The variance comes from **revenue**: $\\min(x, D)$.\n", - "\n", - "When inventory $x$ is high, $\\min(x, D) \\approx D$ for most demand\n", - "realizations — revenue inherits the full variance of demand.\n", - "\n", - "When inventory $x$ is low, $\\min(x, D) \\approx x$ for most realizations —\n", - "revenue is nearly deterministic, capped at the inventory level.\n", - "\n", - "A risk-sensitive agent therefore prefers lower inventory because it **caps the\n", - "randomness of revenue**.\n", - "\n", - "The agent accepts lower expected sales in exchange for more predictable profits.\n", - "\n", - "There is also a continuation value channel: next-period inventory $\\max(x - D,\n", - "0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more\n", - "closely, propagating that variance forward through $v$.\n", - "\n", - "\n", - "## Q-Learning\n", - "\n", - "We now ask whether the optimal policy can be learned without knowledge of the\n", - "model, as we did in the risk-neutral case in {ref}`inventory_q`.\n", - "\n", - "### The Q-factor\n", - "\n", - "The first step is to define the Q-factor in a way that is compatible with the\n", - "risk-sensitive Bellman equation.\n", - "\n", - "We define\n", - "\n", - "$$\n", - " q(x, a) := \\mathbb E\n", - " \\left[\n", - " \\exp\\!\\left(\n", - " -\\gamma \\left( \\pi(x, a, D) + \\beta \\, v^*(h(x, a, D)) \\right)\n", - " \\right)\n", - " \\right].\n", - "$$\n", - "\n", - "In words, $q(x, a)$ applies the risk-sensitivity transformation $\\phi(t) =\n", - "\\exp(-\\gamma t)$ inside the expectation, evaluated at the return from taking\n", - "action $a$ in state $x$ and following the optimal policy thereafter.\n", - "\n", - "### Deriving the Q-factor Bellman equation\n", - "\n", - "Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$.\n", - 
"\n", - "**Step 1.** Express $v^*$ in terms of $q$.\n", - "\n", - "The risk-sensitive Bellman equation says $v^*(x) = \\max_{a \\in \\Gamma(x)}\n", - "\\phi^{-1}(q(x, a))$.\n", - "\n", - "Since $\\phi^{-1}(y) = -\\frac{1}{\\gamma} \\ln(y)$ is a **decreasing** function,\n", - "the maximum over $a$ of $\\phi^{-1}(q(x, a))$ corresponds to the **minimum**\n", - "over $a$ of $q(x, a)$:\n", - "\n", - "$$\n", - " v^*(x)\n", - " = \\phi^{-1}\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right)\n", - " = -\\frac{1}{\\gamma} \\ln\\!\\left(\\min_{a \\in \\Gamma(x)} q(x, a)\\right).\n", - "$$\n", - "\n", - "Equivalently,\n", - "\n", - "$$\n", - " \\exp(-\\gamma \\, v^*(x)) = \\min_{a \\in \\Gamma(x)} q(x, a).\n", - "$$\n", - "\n", - "**Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$.\n", - "\n", - "Expanding the exponential in the definition of $q$,\n", - "\n", - "$$\n", - " q(x, a)\n", - " = \\mathbb E\n", - " \\left[\n", - " \\exp(-\\gamma \\, \\pi(x, a, D))\n", - " \\;\\cdot\\;\n", - " \\exp\\!\\left(-\\gamma \\beta \\, v^*(x')\\right)\n", - " \\right]\n", - "$$\n", - "\n", - "where $x' = h(x, a, D)$.\n", - "\n", - "From Step 1, $\\exp(-\\gamma \\, v^*(x')) = \\min_{a' \\in \\Gamma(x')} q(x', a')$,\n", - "so $\\exp(-\\gamma \\beta \\, v^*(x')) = \\left[\\min_{a' \\in \\Gamma(x')} q(x',\n", - "a')\\right]^\\beta$.\n", - "\n", - "Substituting,\n", - "\n", - "$$\n", - " q(x, a)\n", - " = \\mathbb E\n", - " \\left[\n", - " \\exp(-\\gamma \\, \\pi(x, a, D))\n", - " \\;\\cdot\\;\n", - " \\left(\\min_{a' \\in \\Gamma(x')} q(x', a')\\right)^\\beta\n", - " \\right].\n", - "$$\n", - "\n", - "This is a fixed point equation in $q$ alone — $v^*$ has been eliminated.\n", - "\n", - "### The Q-learning update rule\n", - "\n", - "As in the risk-neutral case, we approximate the fixed point using stochastic\n", - "approximation.\n", - "\n", - "At each step, the agent is in state $x$, takes action $a$, observes profit\n", - "$R_{t+1} = \\pi(x, a, D_{t+1})$ and next state 
$X_{t+1} = h(x, a, D_{t+1})$,\n", - "and updates\n", - "\n", - "$$\n", - " q_{t+1}(x, a)\n", - " = (1 - \\alpha_t) \\, q_t(x, a)\n", - " + \\alpha_t\n", - " \\left[\n", - " \\exp(-\\gamma \\, R_{t+1})\n", - " \\;\\cdot\\;\n", - " \\left(\\min_{a' \\in \\Gamma(X_{t+1})} q_t(X_{t+1}, a')\\right)^\\beta\n", - " \\right].\n", - "$$\n", - "\n", - "The term in brackets is a single-sample estimate of the right-hand side of the\n", - "Q-factor Bellman equation.\n", - "\n", - "The update blends the current estimate with this fresh sample, just as in\n", - "standard Q-learning.\n", - "\n", - "Notice several differences from the risk-neutral case:\n", - "\n", - "- The Q-values are **positive** (expectations of exponentials) rather than signed.\n", - "- The optimal policy is $\\sigma(x) = \\arg\\min_a q(x, a)$ — we **minimize**\n", - " rather than maximize, because $\\phi^{-1}$ is decreasing.\n", - "- The observed profit enters through $\\exp(-\\gamma R_{t+1})$ rather than\n", - " additively.\n", - "- The continuation value enters as a **power** $(\\min_{a'} q_t)^\\beta$ rather\n", - " than a scaled sum $\\beta \\cdot \\max_{a'} q_t$.\n", - "\n", - "As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and\n", - "$X_{t+1}$ — no model knowledge is required.\n", - "\n", - "### Implementation plan\n", - "\n", - "Our implementation follows the same structure as the risk-neutral Q-learning in\n", - "{ref}`inventory_q`, with the modifications above:\n", - "\n", - "1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and\n", - " visit counts $n$ to zeros.\n", - "2. 
**At each step:**\n", - " - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state\n", - " $X_{t+1}$.\n", - " - Compute $\\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a\n", - " scalar for the update target, and the $\\arg\\min$ action is used by the\n", - " $\\varepsilon$-greedy behavior policy).\n", - " - Update $q_t(x, a)$ using the rule above, with learning rate\n", - " $\\alpha_t = 1 / n_t(x, a)^{0.51}$.\n", - " - Choose the next action via $\\varepsilon$-greedy: with probability\n", - " $\\varepsilon$ pick a random feasible action, otherwise pick the\n", - " $\\arg\\min$ action.\n", - " - Decay $\\varepsilon$.\n", - "3. **Extract the greedy policy** from the final Q-table via\n", - " $\\sigma(x) = \\arg\\min_{a \\in \\Gamma(x)} q(x, a)$.\n", - "4. **Compare** the learned policy against the VFI solution.\n", - "\n", - "### Implementation\n", - "\n", - "We first define a helper to extract the greedy policy from the Q-table.\n", - "\n", - "Since the optimal policy minimizes $q$, we use $\\arg\\min$ rather than $\\arg\\max$." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed938977", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def greedy_policy_from_q_rs(q, K):\n", - " \"\"\"Extract greedy policy from risk-sensitive Q-table (argmin).\"\"\"\n", - " σ = np.empty(K + 1, dtype=np.int32)\n", - " for x in range(K + 1):\n", - " best_val = np.inf\n", - " best_a = 0\n", - " for a in range(K - x + 1):\n", - " if q[x, a] < best_val:\n", - " best_val = q[x, a]\n", - " best_a = a\n", - " σ[x] = best_a\n", - " return σ" - ] - }, - { - "cell_type": "markdown", - "id": "db9043ac", - "metadata": {}, - "source": [ - "The Q-learning loop mirrors the risk-neutral version, with the key changes:\n", - "Q-table initialized to ones, the update target uses $\\exp(-\\gamma R_{t+1})\n", - "\\cdot (\\min_{a'} q_t)^\\beta$, and the behavior policy follows the $\\arg\\min$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "73fcd434", - "metadata": {}, - "outputs": [], - "source": [ - "@numba.jit(nopython=True)\n", - "def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", - " ε_init, ε_min, ε_decay, snapshot_steps):\n", - " q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1\n", - " n = np.zeros((K + 1, K + 1)) # visit counts for learning rate\n", - " ε = ε_init\n", - "\n", - " n_snaps = len(snapshot_steps)\n", - " snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32)\n", - " snap_idx = 0\n", - "\n", - " # Initialize state and action\n", - " x = X_init\n", - " a = np.random.randint(0, K - x + 1)\n", - "\n", - " for t in range(n_steps):\n", - " # Record policy snapshot if needed\n", - " if snap_idx < n_snaps and t == snapshot_steps[snap_idx]:\n", - " snapshots[snap_idx] = greedy_policy_from_q_rs(q, K)\n", - " snap_idx += 1\n", - "\n", - " # === Draw D_{t+1} and observe outcome ===\n", - " d = np.random.geometric(p) - 1\n", - " reward = min(x, d) - c * a - κ * (a > 0)\n", - " x_next = max(x - d, 0) + a\n", - "\n", - " # === Min over next state (scalar value for update target) ===\n", - " # Also record the argmin action for use by the behavior policy.\n", - " best_next = np.inf\n", - " a_next = 0\n", - " for aa in range(K - x_next + 1):\n", - " if q[x_next, aa] < best_next:\n", - " best_next = q[x_next, aa]\n", - " a_next = aa\n", - "\n", - " # === Risk-sensitive Q-learning update ===\n", - " target = np.exp(-γ * reward) * best_next ** β\n", - " n[x, a] += 1\n", - " α = 1.0 / n[x, a] ** 0.51\n", - " q[x, a] = (1 - α) * q[x, a] + α * target\n", - "\n", - " # === Behavior policy: ε-greedy (uses a_next, the argmin action) ===\n", - " x = x_next\n", - " if np.random.random() < ε:\n", - " a = np.random.randint(0, K - x + 1)\n", - " else:\n", - " a = a_next\n", - " ε = max(ε_min, ε * ε_decay)\n", - "\n", - " return q, snapshots" - ] - }, - { - "cell_type": "markdown", - "id": "72492777", - 
"metadata": {}, - "source": [ - "The wrapper function unpacks the model and provides default hyperparameters." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8cd9e8c5", - "metadata": {}, - "outputs": [], - "source": [ - "def q_learning_rs(model, n_steps=20_000_000, X_init=0,\n", - " ε_init=1.0, ε_min=0.01, ε_decay=0.999999,\n", - " snapshot_steps=None):\n", - " x_values, d_values, ϕ_values, p, c, κ, β, γ = model\n", - " K = len(x_values) - 1\n", - " if snapshot_steps is None:\n", - " snapshot_steps = np.array([], dtype=np.int64)\n", - " return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init,\n", - " ε_init, ε_min, ε_decay, snapshot_steps)" - ] - }, - { - "cell_type": "markdown", - "id": "9b095eef", - "metadata": {}, - "source": [ - "### Running Q-learning\n", - "\n", - "We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38386ce1", - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(1234)\n", - "snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64)\n", - "q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps)" - ] - }, - { - "cell_type": "markdown", - "id": "985c208b", - "metadata": {}, - "source": [ - "### Comparing with the exact solution\n", - "\n", - "We extract the value function and policy from the final Q-table.\n", - "\n", - "Since Q-values represent $\\mathbb{E}[\\exp(-\\gamma(\\cdots))]$, we recover the\n", - "value function via $v_Q(x) = -\\frac{1}{\\gamma} \\ln(\\min_{a} q(x, a))$ and the\n", - "policy via $\\sigma_Q(x) = \\arg\\min_a q(x, a)$." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dccdb8f4", - "metadata": {}, - "outputs": [], - "source": [ - "K = len(x_values) - 1\n", - "γ_base = model.γ\n", - "# restrict to feasible actions a ∈ {0, ..., K-x}\n", - "v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1]))\n", - " for x in range(K + 1)])\n", - "σ_q = np.array([np.argmin(q_table[x, :K - x + 1])\n", - " for x in range(K + 1)])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a855f91", - "metadata": {}, - "outputs": [], - "source": [ - "fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2))\n", - "\n", - "axes[0].plot(x_values, v_star, label=\"VFI\")\n", - "axes[0].plot(x_values, v_q, '--', label=\"Q-learning\")\n", - "axes[0].set_xlabel(\"inventory\")\n", - "axes[0].set_ylabel(\"value\")\n", - "axes[0].legend()\n", - "axes[0].set_title(\"Value function\")\n", - "\n", - "axes[1].plot(x_values, σ_star, label=\"VFI\")\n", - "axes[1].plot(x_values, σ_q, '--', label=\"Q-learning\")\n", - "axes[1].set_xlabel(\"inventory\")\n", - "axes[1].set_ylabel(\"order quantity\")\n", - "axes[1].legend()\n", - "axes[1].set_title(\"Policy\")\n", - "\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "8579f07d", - "metadata": {}, - "source": [ - "### Visualizing learning over time\n", - "\n", - "The panels below show how the agent's policy evolves during training.\n", - "\n", - "Each panel simulates an inventory path using the greedy policy extracted from\n", - "the Q-table at a given training step, with the same demand sequence throughout.\n", - "\n", - "The top panel shows the optimal policy from VFI for reference." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "142d3906", - "metadata": {}, - "outputs": [], - "source": [ - "ts_length = 200\n", - "n_snaps = len(snap_steps)\n", - "fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)),\n", - " sharex=True)\n", - "\n", - "X_init = K // 2\n", - "sim_seed = 5678\n", - "\n", - "# Optimal policy\n", - "np.random.seed(sim_seed)\n", - "X_opt = sim_inventories(ts_length, σ_star, model.p, X_init)\n", - "axes[0].plot(X_opt, alpha=0.7)\n", - "axes[0].set_ylabel(\"inventory\")\n", - "axes[0].set_title(\"Optimal (VFI)\")\n", - "axes[0].set_ylim(0, K + 2)\n", - "\n", - "# Q-learning snapshots\n", - "for i in range(n_snaps):\n", - " σ_snap = snapshots[i]\n", - " np.random.seed(sim_seed)\n", - " X = sim_inventories(ts_length, σ_snap, model.p, X_init)\n", - " axes[i + 1].plot(X, alpha=0.7)\n", - " axes[i + 1].set_ylabel(\"inventory\")\n", - " axes[i + 1].set_title(f\"Step {snap_steps[i]:,}\")\n", - " axes[i + 1].set_ylim(0, K + 2)\n", - "\n", - "axes[-1].set_xlabel(r\"$t$\")\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "id": "274e77d9", - "metadata": {}, - "source": [ - "After 10,000 steps the agent has barely explored and its policy is erratic.\n", - "\n", - "By 1,000,000 steps the learned policy begins to resemble the optimal one, and\n", - "by step 20 million the inventory dynamics are nearly indistinguishable from the\n", - "VFI solution.\n", - "\n", - "Note that the converged policy maintains lower inventory levels than in the\n", - "risk-neutral case (compare with {ref}`inventory_q`), consistent with the\n", - "mechanism discussed above: a risk-sensitive agent caps its exposure to demand\n", - "variance by holding less stock.\n", - "\n", - "## Conclusion\n", - "\n", - "We extended the inventory management problem from {ref}`inventory_q` to\n", - "incorporate risk sensitivity via the certainty equivalent operator\n", - 
"$\\phi^{-1}(\\mathbb{E}[\\phi(\\cdot)])$ with $\\phi(t) = \\exp(-\\gamma t)$.\n", - "\n", - "Value function iteration confirms that risk-sensitive firms order less\n", - "aggressively, preferring predictable profits over higher but more volatile\n", - "returns.\n", - "\n", - "We then showed that Q-learning can be adapted to the risk-sensitive setting by\n", - "working with the transformed Q-factor $q(x,a) =\n", - "\\mathbb{E}[\\exp(-\\gamma(\\pi + \\beta v^*))]$.\n", - "\n", - "The resulting update rule replaces addition with multiplication and max with\n", - "min, but retains the key property of model-free learning: the agent needs only\n", - "to observe states, actions, and profits." - ] - } - ], - "metadata": { - "jupytext": { - "default_lexer": "ipython3" - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/lectures/rs_inventory_q.py b/lectures/rs_inventory_q.py deleted file mode 100644 index 82a1ba74f..000000000 --- a/lectures/rs_inventory_q.py +++ /dev/null @@ -1,767 +0,0 @@ -# --- -# jupyter: -# jupytext: -# default_lexer: ipython3 -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.18.1 -# kernelspec: -# display_name: Python 3 (ipykernel) -# language: python -# name: python3 -# --- - -# %% [markdown] -# (rs_inventory_q)= -# ```{raw} jupyter -#
-# -# QuantEcon -# -#
-# ``` -# -# # Risk-Sensitive Inventory Management via Q-Learning -# -# ```{contents} Contents -# :depth: 2 -# ``` -# -# ## Introduction -# -# In {ref}`inventory_q`, we looked at an inventory management -# problem and solved it with both value function iteration and Q-learning. -# -# In this lecture, we consider a risk-sensitive variation. -# -# Injection of risk-sensitivity acknowledges the fact -# that, in incomplete markets with financial and informational frictions, firms -# typically take risk into account in their decision making. -# -# In other words, the actions of firms are not, in general, risk neutral. -# -# One natural way to handle this is to use a risk-sensitive version of the Bellman -# equation. -# -# We show how the model can be solved using value function iteration. -# -# We then investigate how risk sensitivity affects the optimal policy. - -# %% -import numpy as np -import numba -import matplotlib.pyplot as plt -from typing import NamedTuple - - -# %% [markdown] -# ## The Model -# -# The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form -# -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} \mathbb E -# \left[ -# \pi(x, a, D) -# + \beta v(h(x, a, D)) -# \right]. -# $$ -# -# Here $D$ is a random variable with distribution $\phi$. -# -# (Primitives and definitions are the same as in {ref}`inventory_q`.) -# -# The risk-sensitive version of this Bellman equation has the form -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} -# \phi^{-1} -# \left\{ -# \mathbb E \phi -# \left[ -# \pi(x, a, D) -# + \beta v(h(x, a, D)) -# \right] -# \right\}, -# $$ -# -# where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. -# -# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes -# -# $$ -# v(x) -# = \max_{a \in \Gamma(x)} -# \left( -# -\frac{1}{\gamma} -# \right) -# \ln -# \left\{ -# \sum_d \phi(d) \exp -# \left[ -# -\gamma \left( \pi(x, a, d) + \beta \, v(h(x, a, d)) \right) -# \right] -# \right\}. 
-# $$ -# -# Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. -# -# The parameter $\gamma$ controls the degree of risk sensitivity. -# -# As $\gamma \to 0$, the certainty equivalent reduces to the ordinary expectation and we recover the risk-neutral case. -# -# Larger $\gamma$ means more aversion to downside risk. -# -# The Bellman operator, greedy policy, and VFI algorithm all carry over from the -# risk-neutral case, with the expectation replaced by the certainty equivalent. -# -# -# -# ## Solving via Value Function Iteration -# -# ### Model specification -# -# We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. - -# %% -class RSModel(NamedTuple): - x_values: np.ndarray # Inventory values - d_values: np.ndarray # Demand values for summation - ϕ_values: np.ndarray # Demand probabilities - p: float # Demand parameter - c: float # Unit cost - κ: float # Fixed cost - β: float # Discount factor - γ: float # Risk-sensitivity parameter - - -# %% -def create_rs_inventory_model( - K: int = 20, # Max inventory - D_MAX: int = 21, # Demand upper bound for summation - p: float = 0.7, - c: float = 0.2, - κ: float = 0.8, - β: float = 0.98, - γ: float = 1.0 - ) -> RSModel: - - def demand_pdf(p, d): - return (1 - p)**d * p - - d_values = np.arange(D_MAX) - ϕ_values = demand_pdf(p, d_values) - x_values = np.arange(K + 1) - - return RSModel(x_values, d_values, ϕ_values, p, c, κ, β, γ) - - -# %% [markdown] -# ### The Bellman operator -# -# The risk-sensitive Bellman operator replaces the expected value with the certainty equivalent. 
-# -# For numerical stability, we use the [log-sum-exp trick](https://en.wikipedia.org/wiki/LogSumExp): given values $z_i = \pi(x, a, d_i) + \beta \, v(h(x, a, d_i))$, we compute -# -# $$ -# -\frac{1}{\gamma} \ln \sum_i \phi(d_i) \exp(-\gamma z_i) -# \;=\; -# -\frac{1}{\gamma} -# \left( -# m + \ln \sum_i \phi(d_i) \exp(-\gamma z_i - m) -# \right), -# $$ -# -# where $m = \max_i (-\gamma z_i)$. - -# %% -@numba.jit(nopython=True) -def T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): - new_v = np.empty(K + 1) - n_d = len(d_values) - for x in range(K + 1): - best = -np.inf - for a in range(K - x + 1): - # Compute -γ * z_i for each demand realization - exponents = np.empty(n_d) - for i in range(n_d): - d = d_values[i] - x_next = max(x - d, 0) + a - revenue = min(x, d) - cost = c * a + κ * (a > 0) - z_i = revenue - cost + β * v[x_next] - exponents[i] = -γ * z_i - # Log-sum-exp trick for numerical stability - m = np.max(exponents) - weighted_sum = 0.0 - for i in range(n_d): - weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) - val = -(1.0 / γ) * (m + np.log(weighted_sum)) - if val > best: - best = val - new_v[x] = best - return new_v - - -# %% -def T_rs(v, model): - """The risk-sensitive Bellman operator.""" - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - return T_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) - - -# %% [markdown] -# ### Computing the greedy policy -# -# The greedy policy records the maximizing action instead of the maximized value. 
- -# %% -@numba.jit(nopython=True) -def get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K): - σ = np.empty(K + 1, dtype=np.int32) - n_d = len(d_values) - for x in range(K + 1): - best = -np.inf - best_a = 0 - for a in range(K - x + 1): - exponents = np.empty(n_d) - for i in range(n_d): - d = d_values[i] - x_next = max(x - d, 0) + a - revenue = min(x, d) - cost = c * a + κ * (a > 0) - z_i = revenue - cost + β * v[x_next] - exponents[i] = -γ * z_i - m = np.max(exponents) - weighted_sum = 0.0 - for i in range(n_d): - weighted_sum += ϕ_values[i] * np.exp(exponents[i] - m) - val = -(1.0 / γ) * (m + np.log(weighted_sum)) - if val > best: - best = val - best_a = a - σ[x] = best_a - return σ - - -# %% -def get_greedy_rs(v, model): - """Get a v-greedy policy for the risk-sensitive model.""" - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - return get_greedy_rs_kernel(v, d_values, ϕ_values, c, κ, β, γ, K) - - -# %% [markdown] -# ### Value function iteration - -# %% -def solve_rs_inventory_model(v_init, model, max_iter=10_000, tol=1e-6): - v = v_init.copy() - i, error = 0, tol + 1 - - while i < max_iter and error > tol: - new_v = T_rs(v, model) - error = np.max(np.abs(new_v - v)) - i += 1 - v = new_v - - print(f"Converged in {i} iterations with error {error:.2e}") - - σ = get_greedy_rs(v, model) - return v, σ - - -# %% [markdown] -# ### Creating and solving an instance - -# %% -model = create_rs_inventory_model() -x_values = model.x_values -n_x = len(x_values) -v_init = np.zeros(n_x) - -# %% -v_star, σ_star = solve_rs_inventory_model(v_init, model) - -# %% [markdown] -# ### Effect of risk sensitivity on the optimal policy -# -# We solve the model for several values of $\gamma$ and compare the resulting policies. -# -# As we will see, a risk-sensitive firm orders less aggressively than a risk-neutral one. 
- -# %% -γ_values = [0.01, 1.0, 2.0] -results = {} - -for γ in γ_values: - mod = create_rs_inventory_model(γ=γ) - v, σ = solve_rs_inventory_model(np.zeros(n_x), mod) - results[γ] = (v, σ) - -# %% -fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) - -for γ in γ_values: - v, σ = results[γ] - axes[0].plot(x_values, v, label=f"$\\gamma = {γ}$") - axes[1].plot(x_values, σ, label=f"$\\gamma = {γ}$") - -axes[0].set_xlabel("inventory") -axes[0].set_ylabel("value") -axes[0].legend() -axes[0].set_title("Value function") - -axes[1].set_xlabel("inventory") -axes[1].set_ylabel("order quantity") -axes[1].legend() -axes[1].set_title("Policy") - -plt.tight_layout() -plt.show() - - -# %% [markdown] -# ### Simulating the optimal policy -# -# We simulate inventory dynamics under the optimal policy for the baseline $\gamma$. - -# %% -@numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): - """Simulate inventory dynamics under policy σ.""" - X = np.zeros(ts_length, dtype=np.int32) - X[0] = X_init - for t in range(ts_length - 1): - d = np.random.geometric(p) - 1 - X[t+1] = max(X[t] - d, 0) + σ[X[t]] - return X - - -# %% -fig, axes = plt.subplots(len(γ_values), 1, - figsize=(8, 2.0 * len(γ_values)), - sharex=True) - -ts_length = 200 -sim_seed = 5678 -K = len(x_values) - 1 - -for i, γ in enumerate(γ_values): - v, σ = results[γ] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) - axes[i].plot(X, alpha=0.7) - axes[i].set_ylabel("inventory") - axes[i].set_title(f"$\\gamma = {γ}$") - axes[i].set_ylim(0, K + 2) - -axes[-1].set_xlabel(r"$t$") -plt.tight_layout() -plt.show() - - -# %% [markdown] -# ## Interpreting the Outcomes -# -# The plots above show that a more risk-sensitive firm (larger $\gamma$) orders -# less inventory and maintains lower stock levels. -# -# At first glance this may seem surprising: wouldn't holding more inventory -# reduce variance by ensuring the firm can always meet demand? 
-# -# The key is to identify where the randomness in profits actually comes from. -# -# Recall that per-period profit is $\pi(x, a, d) = \min(x, d) - ca - \kappa -# \mathbf{1}\{a > 0\}$. -# -# The ordering cost $ca + \kappa \mathbf{1}\{a > 0\}$ is **deterministic** — it -# is chosen before the demand shock is realized. -# -# So higher ordering shifts the level of profits down but does not affect their -# variance. -# -# The variance comes from **revenue**: $\min(x, D)$. -# -# When inventory $x$ is high, $\min(x, D) \approx D$ for most demand -# realizations — revenue inherits the full variance of demand. -# -# When inventory $x$ is low, $\min(x, D) \approx x$ for most realizations — -# revenue is nearly deterministic, capped at the inventory level. -# -# A risk-sensitive agent therefore prefers lower inventory because it **caps the -# randomness of revenue**. -# -# The agent accepts lower expected sales in exchange for more predictable profits. -# -# There is also a continuation value channel: next-period inventory $\max(x - D, -# 0) + a$ varies with $D$, and higher $x$ means $x - D$ tracks $D$ more -# closely, propagating that variance forward through $v$. -# -# -# ## Q-Learning -# -# We now ask whether the optimal policy can be learned without knowledge of the -# model, as we did in the risk-neutral case in {ref}`inventory_q`. -# -# ### The Q-factor -# -# The first step is to define the Q-factor in a way that is compatible with the -# risk-sensitive Bellman equation. -# -# We define -# -# $$ -# q(x, a) := \mathbb E -# \left[ -# \exp\!\left( -# -\gamma \left( \pi(x, a, D) + \beta \, v^*(h(x, a, D)) \right) -# \right) -# \right]. -# $$ -# -# In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = -# \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking -# action $a$ in state $x$ and following the optimal policy thereafter. 
-# -# ### Deriving the Q-factor Bellman equation -# -# Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. -# -# **Step 1.** Express $v^*$ in terms of $q$. -# -# The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} -# \phi^{-1}(q(x, a))$. -# -# Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, -# the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** -# over $a$ of $q(x, a)$: -# -# $$ -# v^*(x) -# = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) -# = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). -# $$ -# -# Equivalently, -# -# $$ -# \exp(-\gamma \, v^*(x)) = \min_{a \in \Gamma(x)} q(x, a). -# $$ -# -# **Step 2.** Substitute back into the definition of $q$ to eliminate $v^*$. -# -# Expanding the exponential in the definition of $q$, -# -# $$ -# q(x, a) -# = \mathbb E -# \left[ -# \exp(-\gamma \, \pi(x, a, D)) -# \;\cdot\; -# \exp\!\left(-\gamma \beta \, v^*(x')\right) -# \right] -# $$ -# -# where $x' = h(x, a, D)$. -# -# From Step 1, $\exp(-\gamma \, v^*(x')) = \min_{a' \in \Gamma(x')} q(x', a')$, -# so $\exp(-\gamma \beta \, v^*(x')) = \left[\min_{a' \in \Gamma(x')} q(x', -# a')\right]^\beta$. -# -# Substituting, -# -# $$ -# q(x, a) -# = \mathbb E -# \left[ -# \exp(-\gamma \, \pi(x, a, D)) -# \;\cdot\; -# \left(\min_{a' \in \Gamma(x')} q(x', a')\right)^\beta -# \right]. -# $$ -# -# This is a fixed point equation in $q$ alone — $v^*$ has been eliminated. -# -# ### The Q-learning update rule -# -# As in the risk-neutral case, we approximate the fixed point using stochastic -# approximation. 
-# -# At each step, the agent is in state $x$, takes action $a$, observes profit -# $R_{t+1} = \pi(x, a, D_{t+1})$ and next state $X_{t+1} = h(x, a, D_{t+1})$, -# and updates -# -# $$ -# q_{t+1}(x, a) -# = (1 - \alpha_t) \, q_t(x, a) -# + \alpha_t -# \left[ -# \exp(-\gamma \, R_{t+1}) -# \;\cdot\; -# \left(\min_{a' \in \Gamma(X_{t+1})} q_t(X_{t+1}, a')\right)^\beta -# \right]. -# $$ -# -# The term in brackets is a single-sample estimate of the right-hand side of the -# Q-factor Bellman equation. -# -# The update blends the current estimate with this fresh sample, just as in -# standard Q-learning. -# -# Notice several differences from the risk-neutral case: -# -# - The Q-values are **positive** (expectations of exponentials) rather than signed. -# - The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** -# rather than maximize, because $\phi^{-1}$ is decreasing. -# - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than -# additively. -# - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather -# than a scaled sum $\beta \cdot \max_{a'} q_t$. -# -# As before, the agent needs only to observe $x$, $a$, $R_{t+1}$, and -# $X_{t+1}$ — no model knowledge is required. -# -# ### Implementation plan -# -# Our implementation follows the same structure as the risk-neutral Q-learning in -# {ref}`inventory_q`, with the modifications above: -# -# 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and -# visit counts $n$ to zeros. -# 2. **At each step:** -# - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state -# $X_{t+1}$. -# - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a -# scalar for the update target, and the $\arg\min$ action is used by the -# $\varepsilon$-greedy behavior policy). -# - Update $q_t(x, a)$ using the rule above, with learning rate -# $\alpha_t = 1 / n_t(x, a)^{0.51}$. 
-# - Choose the next action via $\varepsilon$-greedy: with probability -# $\varepsilon$ pick a random feasible action, otherwise pick the -# $\arg\min$ action. -# - Decay $\varepsilon$. -# 3. **Extract the greedy policy** from the final Q-table via -# $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. -# 4. **Compare** the learned policy against the VFI solution. -# -# ### Implementation -# -# We first define a helper to extract the greedy policy from the Q-table. -# -# Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. - -# %% -@numba.jit(nopython=True) -def greedy_policy_from_q_rs(q, K): - """Extract greedy policy from risk-sensitive Q-table (argmin).""" - σ = np.empty(K + 1, dtype=np.int32) - for x in range(K + 1): - best_val = np.inf - best_a = 0 - for a in range(K - x + 1): - if q[x, a] < best_val: - best_val = q[x, a] - best_a = a - σ[x] = best_a - return σ - - -# %% [markdown] -# The Q-learning loop mirrors the risk-neutral version, with the key changes: -# Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) -# \cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. 
- -# %% -@numba.jit(nopython=True) -def q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): - q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 - n = np.zeros((K + 1, K + 1)) # visit counts for learning rate - ε = ε_init - - n_snaps = len(snapshot_steps) - snapshots = np.zeros((n_snaps, K + 1), dtype=np.int32) - snap_idx = 0 - - # Initialize state and action - x = X_init - a = np.random.randint(0, K - x + 1) - - for t in range(n_steps): - # Record policy snapshot if needed - if snap_idx < n_snaps and t == snapshot_steps[snap_idx]: - snapshots[snap_idx] = greedy_policy_from_q_rs(q, K) - snap_idx += 1 - - # === Draw D_{t+1} and observe outcome === - d = np.random.geometric(p) - 1 - reward = min(x, d) - c * a - κ * (a > 0) - x_next = max(x - d, 0) + a - - # === Min over next state (scalar value for update target) === - # Also record the argmin action for use by the behavior policy. - best_next = np.inf - a_next = 0 - for aa in range(K - x_next + 1): - if q[x_next, aa] < best_next: - best_next = q[x_next, aa] - a_next = aa - - # === Risk-sensitive Q-learning update === - target = np.exp(-γ * reward) * best_next ** β - n[x, a] += 1 - α = 1.0 / n[x, a] ** 0.51 - q[x, a] = (1 - α) * q[x, a] + α * target - - # === Behavior policy: ε-greedy (uses a_next, the argmin action) === - x = x_next - if np.random.random() < ε: - a = np.random.randint(0, K - x + 1) - else: - a = a_next - ε = max(ε_min, ε * ε_decay) - - return q, snapshots - - -# %% [markdown] -# The wrapper function unpacks the model and provides default hyperparameters. 
- -# %% -def q_learning_rs(model, n_steps=20_000_000, X_init=0, - ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): - x_values, d_values, ϕ_values, p, c, κ, β, γ = model - K = len(x_values) - 1 - if snapshot_steps is None: - snapshot_steps = np.array([], dtype=np.int64) - return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) - - -# %% [markdown] -# ### Running Q-learning -# -# We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. - -# %% -np.random.seed(1234) -snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) -q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) - -# %% [markdown] -# ### Comparing with the exact solution -# -# We extract the value function and policy from the final Q-table. -# -# Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the -# value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the -# policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. - -# %% -K = len(x_values) - 1 -γ_base = model.γ -# restrict to feasible actions a ∈ {0, ..., K-x} -v_q = np.array([-(1/γ_base) * np.log(np.min(q_table[x, :K - x + 1])) - for x in range(K + 1)]) -σ_q = np.array([np.argmin(q_table[x, :K - x + 1]) - for x in range(K + 1)]) - -# %% -fig, axes = plt.subplots(1, 2, figsize=(9.6, 3.2)) - -axes[0].plot(x_values, v_star, label="VFI") -axes[0].plot(x_values, v_q, '--', label="Q-learning") -axes[0].set_xlabel("inventory") -axes[0].set_ylabel("value") -axes[0].legend() -axes[0].set_title("Value function") - -axes[1].plot(x_values, σ_star, label="VFI") -axes[1].plot(x_values, σ_q, '--', label="Q-learning") -axes[1].set_xlabel("inventory") -axes[1].set_ylabel("order quantity") -axes[1].legend() -axes[1].set_title("Policy") - -plt.tight_layout() -plt.show() - -# %% [markdown] -# ### Visualizing learning over time -# -# The panels below show how the agent's policy evolves during training. 
-# -# Each panel simulates an inventory path using the greedy policy extracted from -# the Q-table at a given training step, with the same demand sequence throughout. -# -# The top panel shows the optimal policy from VFI for reference. - -# %% -ts_length = 200 -n_snaps = len(snap_steps) -fig, axes = plt.subplots(n_snaps + 1, 1, figsize=(8, 2.0 * (n_snaps + 1)), - sharex=True) - -X_init = K // 2 -sim_seed = 5678 - -# Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) -axes[0].plot(X_opt, alpha=0.7) -axes[0].set_ylabel("inventory") -axes[0].set_title("Optimal (VFI)") -axes[0].set_ylim(0, K + 2) - -# Q-learning snapshots -for i in range(n_snaps): - σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, model.p, X_init) - axes[i + 1].plot(X, alpha=0.7) - axes[i + 1].set_ylabel("inventory") - axes[i + 1].set_title(f"Step {snap_steps[i]:,}") - axes[i + 1].set_ylim(0, K + 2) - -axes[-1].set_xlabel(r"$t$") -plt.tight_layout() -plt.show() - -# %% [markdown] -# After 10,000 steps the agent has barely explored and its policy is erratic. -# -# By 1,000,000 steps the learned policy begins to resemble the optimal one, and -# by step 20 million the inventory dynamics are nearly indistinguishable from the -# VFI solution. -# -# Note that the converged policy maintains lower inventory levels than in the -# risk-neutral case (compare with {ref}`inventory_q`), consistent with the -# mechanism discussed above: a risk-sensitive agent caps its exposure to demand -# variance by holding less stock. -# -# ## Conclusion -# -# We extended the inventory management problem from {ref}`inventory_q` to -# incorporate risk sensitivity via the certainty equivalent operator -# $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. -# -# Value function iteration confirms that risk-sensitive firms order less -# aggressively, preferring predictable profits over higher but more volatile -# returns. 
-# -# We then showed that Q-learning can be adapted to the risk-sensitive setting by -# working with the transformed Q-factor $q(x,a) = -# \mathbb{E}[\exp(-\gamma(\pi + \beta v^*))]$. -# -# The resulting update rule replaces addition with multiplication and max with -# min, but retains the key property of model-free learning: the agent needs only -# to observe states, actions, and profits. From 78a43a97ed70c94fab932f09b470004ddfc44af2 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Mon, 9 Mar 2026 17:10:26 +1100 Subject: [PATCH 4/8] Fix cross-references: use {doc} instead of {ref} for lecture links The {ref} directive requires a label placed before a heading to resolve a title. The inventory_q label is before a raw block, so {ref} fails. Using {doc} links to the document directly. Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 6a368656b..157495109 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -28,7 +28,7 @@ kernelspec: ## Introduction -In {ref}`inventory_q`, we looked at an inventory management +In {doc}`inventory_q`, we looked at an inventory management problem and solved it with both value function iteration and Q-learning. In this lecture, we consider a risk-sensitive variation. @@ -58,7 +58,7 @@ from typing import NamedTuple ## The Model -The Bellman equation for the inventory management problem in {ref}`inventory_q` has the form +The Bellman equation for the inventory management problem in {doc}`inventory_q` has the form $$ @@ -72,7 +72,7 @@ $$ Here $D$ is a random variable with distribution $\phi$. -(Primitives and definitions are the same as in {ref}`inventory_q`.) +(Primitives and definitions are the same as in {doc}`inventory_q`.) The risk-sensitive version of this Bellman equation has the form @@ -108,7 +108,7 @@ $$ \right\}. 
$$ -Here $\phi(d)$ denotes the demand probability mass function, as in {ref}`inventory_q`. +Here $\phi(d)$ denotes the demand probability mass function, as in {doc}`inventory_q`. The parameter $\gamma$ controls the degree of risk sensitivity. @@ -125,7 +125,7 @@ risk-neutral case, with the expectation replaced by the certainty equivalent. ### Model specification -We reuse the same model primitives as in {ref}`inventory_q`, adding $\gamma$ as a parameter. +We reuse the same model primitives as in {doc}`inventory_q`, adding $\gamma$ as a parameter. ```{code-cell} ipython3 class RSModel(NamedTuple): @@ -405,7 +405,7 @@ closely, propagating that variance forward through $v$. ## Q-Learning We now ask whether the optimal policy can be learned without knowledge of the -model, as we did in the risk-neutral case in {ref}`inventory_q`. +model, as we did in the risk-neutral case in {doc}`inventory_q`. ### The Q-factor @@ -528,7 +528,7 @@ $X_{t+1}$ — no model knowledge is required. ### Implementation plan Our implementation follows the same structure as the risk-neutral Q-learning in -{ref}`inventory_q`, with the modifications above: +{doc}`inventory_q`, with the modifications above: 1. **Initialize** the Q-table $q$ to ones (since Q-values are positive) and visit counts $n$ to zeros. @@ -738,13 +738,13 @@ by step 20 million the inventory dynamics are nearly indistinguishable from the VFI solution. Note that the converged policy maintains lower inventory levels than in the -risk-neutral case (compare with {ref}`inventory_q`), consistent with the +risk-neutral case (compare with {doc}`inventory_q`), consistent with the mechanism discussed above: a risk-sensitive agent caps its exposure to demand variance by holding less stock. 
## Conclusion -We extended the inventory management problem from {ref}`inventory_q` to +We extended the inventory management problem from {doc}`inventory_q` to incorporate risk sensitivity via the certainty equivalent operator $\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. From 29762cd1acdffd05c6e4d6a3a2c2c2c62fb9f104 Mon Sep 17 00:00:00 2001 From: Matt McKay Date: Tue, 10 Mar 2026 08:59:16 +1100 Subject: [PATCH 5/8] Use \argmin and \argmax macros from _config.yml --- lectures/rs_inventory_q.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 157495109..58e6b1b9d 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -515,7 +515,7 @@ standard Q-learning. Notice several differences from the risk-neutral case: - The Q-values are **positive** (expectations of exponentials) rather than signed. -- The optimal policy is $\sigma(x) = \arg\min_a q(x, a)$ — we **minimize** +- The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ — we **minimize** rather than maximize, because $\phi^{-1}$ is decreasing. - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than additively. @@ -536,23 +536,23 @@ Our implementation follows the same structure as the risk-neutral Q-learning in - Draw demand $D_{t+1}$ and compute observed profit $R_{t+1}$ and next state $X_{t+1}$. - Compute $\min_{a'} q_t(X_{t+1}, a')$ over feasible actions (this is a - scalar for the update target, and the $\arg\min$ action is used by the + scalar for the update target, and the $\argmin$ action is used by the $\varepsilon$-greedy behavior policy). - Update $q_t(x, a)$ using the rule above, with learning rate $\alpha_t = 1 / n_t(x, a)^{0.51}$. - Choose the next action via $\varepsilon$-greedy: with probability $\varepsilon$ pick a random feasible action, otherwise pick the - $\arg\min$ action. + $\argmin$ action. - Decay $\varepsilon$. 3. 
**Extract the greedy policy** from the final Q-table via - $\sigma(x) = \arg\min_{a \in \Gamma(x)} q(x, a)$. + $\sigma(x) = \argmin_{a \in \Gamma(x)} q(x, a)$. 4. **Compare** the learned policy against the VFI solution. ### Implementation We first define a helper to extract the greedy policy from the Q-table. -Since the optimal policy minimizes $q$, we use $\arg\min$ rather than $\arg\max$. +Since the optimal policy minimizes $q$, we use $\argmin$ rather than $\argmax$. ```{code-cell} ipython3 @numba.jit(nopython=True) @@ -572,7 +572,7 @@ def greedy_policy_from_q_rs(q, K): The Q-learning loop mirrors the risk-neutral version, with the key changes: Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) -\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\arg\min$. +\cdot (\min_{a'} q_t)^\beta$, and the behavior policy follows the $\argmin$. ```{code-cell} ipython3 @numba.jit(nopython=True) @@ -657,7 +657,7 @@ We extract the value function and policy from the final Q-table. Since Q-values represent $\mathbb{E}[\exp(-\gamma(\cdots))]$, we recover the value function via $v_Q(x) = -\frac{1}{\gamma} \ln(\min_{a} q(x, a))$ and the -policy via $\sigma_Q(x) = \arg\min_a q(x, a)$. +policy via $\sigma_Q(x) = \argmin_a q(x, a)$. ```{code-cell} ipython3 K = len(x_values) - 1 From 93ca801d9149340887bd94d38166ae72875bdc2b Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 10:00:07 +1100 Subject: [PATCH 6/8] =?UTF-8?q?Rename=20risk-sensitivity=20function=20from?= =?UTF-8?q?=20=CF=86=20to=20=CF=88=20to=20avoid=20notation=20conflict?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The symbol φ was overloaded: used for both the demand PMF and the risk-sensitivity transformation. Now ψ(t) = exp(-γt) is the risk transformation and φ(d) remains the demand PMF, consistent with inventory_q.md. 
Co-Authored-By: Claude Opus 4.6 --- lectures/rs_inventory_q.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index 58e6b1b9d..dd77bd16a 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -79,9 +79,9 @@ The risk-sensitive version of this Bellman equation has the form $$ v(x) = \max_{a \in \Gamma(x)} - \phi^{-1} + \psi^{-1} \left\{ - \mathbb E \phi + \mathbb E \psi \left[ \pi(x, a, D) + \beta v(h(x, a, D)) @@ -89,9 +89,9 @@ $$ \right\}, $$ -where $\phi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. +where $\psi(t) = \exp(-\gamma t)$ for fixed $\gamma > 0$. -Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes +Since $\psi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$, the Bellman equation becomes $$ v(x) @@ -423,7 +423,7 @@ $$ \right]. $$ -In words, $q(x, a)$ applies the risk-sensitivity transformation $\phi(t) = +In words, $q(x, a)$ applies the risk-sensitivity transformation $\psi(t) = \exp(-\gamma t)$ inside the expectation, evaluated at the return from taking action $a$ in state $x$ and following the optimal policy thereafter. @@ -434,15 +434,15 @@ Our goal is to obtain a fixed point equation in $q$ alone, eliminating $v^*$. **Step 1.** Express $v^*$ in terms of $q$. The risk-sensitive Bellman equation says $v^*(x) = \max_{a \in \Gamma(x)} -\phi^{-1}(q(x, a))$. +\psi^{-1}(q(x, a))$. -Since $\phi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, -the maximum over $a$ of $\phi^{-1}(q(x, a))$ corresponds to the **minimum** +Since $\psi^{-1}(y) = -\frac{1}{\gamma} \ln(y)$ is a **decreasing** function, +the maximum over $a$ of $\psi^{-1}(q(x, a))$ corresponds to the **minimum** over $a$ of $q(x, a)$: $$ v^*(x) - = \phi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) + = \psi^{-1}\!\left(\min_{a \in \Gamma(x)} q(x, a)\right) = -\frac{1}{\gamma} \ln\!\left(\min_{a \in \Gamma(x)} q(x, a)\right). 
$$ @@ -516,7 +516,7 @@ Notice several differences from the risk-neutral case: - The Q-values are **positive** (expectations of exponentials) rather than signed. - The optimal policy is $\sigma(x) = \argmin_a q(x, a)$ — we **minimize** - rather than maximize, because $\phi^{-1}$ is decreasing. + rather than maximize, because $\psi^{-1}$ is decreasing. - The observed profit enters through $\exp(-\gamma R_{t+1})$ rather than additively. - The continuation value enters as a **power** $(\min_{a'} q_t)^\beta$ rather @@ -746,7 +746,7 @@ variance by holding less stock. We extended the inventory management problem from {doc}`inventory_q` to incorporate risk sensitivity via the certainty equivalent operator -$\phi^{-1}(\mathbb{E}[\phi(\cdot)])$ with $\phi(t) = \exp(-\gamma t)$. +$\psi^{-1}(\mathbb{E}[\psi(\cdot)])$ with $\psi(t) = \exp(-\gamma t)$. Value function iteration confirms that risk-sensitive firms order less aggressively, preferring predictable profits over higher but more volatile From 1707fc6b2e4a68b0953a18b953e7c315275cf36b Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 11:04:15 +1100 Subject: [PATCH 7/8] Add forward ref from inventory_q and fix minor grammar - Add cross-reference from inventory_q.md to rs_inventory_q.md - Fix missing comma after introductory phrase Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 2 ++ lectures/rs_inventory_q.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 39740eb6b..56c4cde2c 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -56,6 +56,8 @@ The lecture proceeds as follows: 2. We introduce Q-factors and derive the Q-factor Bellman equation. 3. We implement Q-learning and show the learned policy converges to the optimal one. +A risk-sensitive extension of this model is studied in {doc}`rs_inventory_q`. 
+ We will use the following imports: ```{code-cell} ipython3 diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index dd77bd16a..da2f1d176 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -731,7 +731,7 @@ plt.tight_layout() plt.show() ``` -After 10,000 steps the agent has barely explored and its policy is erratic. +After 10,000 steps, the agent has barely explored and its policy is erratic. By 1,000,000 steps the learned policy begins to resemble the optimal one, and by step 20 million the inventory dynamics are nearly indistinguishable from the From fdd3a7186122f92198b6fa1b3c52702a601358f6 Mon Sep 17 00:00:00 2001 From: John Stachurski Date: Tue, 10 Mar 2026 11:50:52 +1100 Subject: [PATCH 8/8] Move random seeds inside jitted functions for reproducibility Numba JIT functions use their own RNG state, so np.random.seed() called outside a jitted function has no effect on random draws inside it. Moved seeds into sim_inventories and q_learning kernels as parameters. Fixes issue noted by @HumphreyYang. 
Co-Authored-By: Claude Opus 4.6 --- lectures/inventory_q.md | 17 ++++++++--------- lectures/rs_inventory_q.md | 20 +++++++++----------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/lectures/inventory_q.md b/lectures/inventory_q.md index 56c4cde2c..146921bf8 100644 --- a/lectures/inventory_q.md +++ b/lectures/inventory_q.md @@ -348,8 +348,9 @@ At each step, we draw a demand shock from the geometric distribution and update ```{code-cell} ipython3 @numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): +def sim_inventories(ts_length, σ, p, X_init=0, seed=0): """Simulate inventory dynamics under policy σ.""" + np.random.seed(seed) X = np.zeros(ts_length, dtype=np.int32) X[0] = X_init for t in range(ts_length - 1): @@ -586,7 +587,8 @@ At specified step counts (given by `snapshot_steps`), we record the current gree ```{code-cell} ipython3 @numba.jit(nopython=True) def q_learning_kernel(K, p, c, κ, β, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): + ε_init, ε_min, ε_decay, snapshot_steps, seed): + np.random.seed(seed) q = np.zeros((K + 1, K + 1)) n = np.zeros((K + 1, K + 1)) # visit counts for learning rate ε = ε_init @@ -640,13 +642,13 @@ The wrapper function unpacks the model and provides default hyperparameters. ```{code-cell} ipython3 def q_learning(model, n_steps=20_000_000, X_init=0, ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): + snapshot_steps=None, seed=1234): x_values, d_values, ϕ_values, p, c, κ, β = model K = len(x_values) - 1 if snapshot_steps is None: snapshot_steps = np.array([], dtype=np.int64) return q_learning_kernel(K, p, c, κ, β, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) + ε_init, ε_min, ε_decay, snapshot_steps, seed) ``` ### Running Q-learning @@ -654,7 +656,6 @@ def q_learning(model, n_steps=20_000_000, X_init=0, We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. 
```{code-cell} ipython3 -np.random.seed(1234) snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) q, snapshots = q_learning(model, snapshot_steps=snap_steps) ``` @@ -723,8 +724,7 @@ X_init = K // 2 sim_seed = 5678 # Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, p, X_init) +X_opt = sim_inventories(ts_length, σ_star, p, X_init, seed=sim_seed) axes[0].plot(X_opt, alpha=0.7) axes[0].set_ylabel("inventory") axes[0].set_title("Optimal (VFI)") @@ -733,8 +733,7 @@ axes[0].set_ylim(0, K + 2) # Q-learning snapshots for i in range(n_snaps): σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, p, X_init) + X = sim_inventories(ts_length, σ_snap, p, X_init, seed=sim_seed) axes[i + 1].plot(X, alpha=0.7) axes[i + 1].set_ylabel("inventory") axes[i + 1].set_title(f"Step {snap_steps[i]:,}") diff --git a/lectures/rs_inventory_q.md b/lectures/rs_inventory_q.md index da2f1d176..cd81e6bad 100644 --- a/lectures/rs_inventory_q.md +++ b/lectures/rs_inventory_q.md @@ -332,8 +332,9 @@ We simulate inventory dynamics under the optimal policy for the baseline $\gamma ```{code-cell} ipython3 @numba.jit(nopython=True) -def sim_inventories(ts_length, σ, p, X_init=0): +def sim_inventories(ts_length, σ, p, X_init=0, seed=0): """Simulate inventory dynamics under policy σ.""" + np.random.seed(seed) X = np.zeros(ts_length, dtype=np.int32) X[0] = X_init for t in range(ts_length - 1): @@ -353,8 +354,7 @@ K = len(x_values) - 1 for i, γ in enumerate(γ_values): v, σ = results[γ] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ, model.p, X_init=K // 2) + X = sim_inventories(ts_length, σ, model.p, X_init=K // 2, seed=sim_seed) axes[i].plot(X, alpha=0.7) axes[i].set_ylabel("inventory") axes[i].set_title(f"$\\gamma = {γ}$") @@ -577,7 +577,8 @@ Q-table initialized to ones, the update target uses $\exp(-\gamma R_{t+1}) ```{code-cell} ipython3 @numba.jit(nopython=True) def q_learning_rs_kernel(K, p, c, κ, β, 
γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps): + ε_init, ε_min, ε_decay, snapshot_steps, seed): + np.random.seed(seed) q = np.ones((K + 1, K + 1)) # positive Q-values, initialized to 1 n = np.zeros((K + 1, K + 1)) # visit counts for learning rate ε = ε_init @@ -632,13 +633,13 @@ The wrapper function unpacks the model and provides default hyperparameters. ```{code-cell} ipython3 def q_learning_rs(model, n_steps=20_000_000, X_init=0, ε_init=1.0, ε_min=0.01, ε_decay=0.999999, - snapshot_steps=None): + snapshot_steps=None, seed=1234): x_values, d_values, ϕ_values, p, c, κ, β, γ = model K = len(x_values) - 1 if snapshot_steps is None: snapshot_steps = np.array([], dtype=np.int64) return q_learning_rs_kernel(K, p, c, κ, β, γ, n_steps, X_init, - ε_init, ε_min, ε_decay, snapshot_steps) + ε_init, ε_min, ε_decay, snapshot_steps, seed) ``` ### Running Q-learning @@ -646,7 +647,6 @@ def q_learning_rs(model, n_steps=20_000_000, X_init=0, We run 20 million steps and take policy snapshots at steps 10,000, 1,000,000, and at the end. ```{code-cell} ipython3 -np.random.seed(1234) snap_steps = np.array([10_000, 1_000_000, 19_999_999], dtype=np.int64) q_table, snapshots = q_learning_rs(model, snapshot_steps=snap_steps) ``` @@ -709,8 +709,7 @@ X_init = K // 2 sim_seed = 5678 # Optimal policy -np.random.seed(sim_seed) -X_opt = sim_inventories(ts_length, σ_star, model.p, X_init) +X_opt = sim_inventories(ts_length, σ_star, model.p, X_init, seed=sim_seed) axes[0].plot(X_opt, alpha=0.7) axes[0].set_ylabel("inventory") axes[0].set_title("Optimal (VFI)") @@ -719,8 +718,7 @@ axes[0].set_ylim(0, K + 2) # Q-learning snapshots for i in range(n_snaps): σ_snap = snapshots[i] - np.random.seed(sim_seed) - X = sim_inventories(ts_length, σ_snap, model.p, X_init) + X = sim_inventories(ts_length, σ_snap, model.p, X_init, seed=sim_seed) axes[i + 1].plot(X, alpha=0.7) axes[i + 1].set_ylabel("inventory") axes[i + 1].set_title(f"Step {snap_steps[i]:,}")