From d40e0469bc6a985b1bb226e4171f215c50ba88b6 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 11:05:29 +0100 Subject: [PATCH 01/11] wip: docker dev --- docker/cc-vec-bot.docker | 58 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 docker/cc-vec-bot.docker diff --git a/docker/cc-vec-bot.docker b/docker/cc-vec-bot.docker new file mode 100644 index 0000000..b4e5f86 --- /dev/null +++ b/docker/cc-vec-bot.docker @@ -0,0 +1,58 @@ +FROM llamastack/distribution-starter:0.4.1 +LABEL maintainer="damian@commoncrawl.org" + +USER root + +# Install minimal dependencies required by the Ollama install script +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl ca-certificates gnupg \ + && rm -rf /var/lib/apt/lists/* + +# Install Ollama +RUN curl -fsSL https://ollama.ai/install.sh | sh + +ENV PATH="/usr/local/bin:${PATH}" + +# Default model to pull on startup (tinyllama ~637MB, smallest practical LLM) +# For even smaller (embeddings only): all-minilm (~45MB) +# For production: llama3.2:3b (~2GB) +ENV OLLAMA_MODEL="tinyllama" + +# Create entrypoint script that starts ollama and pulls model on first run +RUN cat <<'EOF' > /entrypoint.sh +#!/bin/bash +set -e + +# Start ollama server in background +ollama serve & +OLLAMA_PID=$! + +# Wait for ollama to be ready +echo "Waiting for Ollama to start..." +for i in {1..30}; do + if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then + echo "Ollama is ready" + break + fi + sleep 1 +done + +# Pull model if not already present +if [ -n "$OLLAMA_MODEL" ]; then + echo "Ensuring model $OLLAMA_MODEL is available..." + ollama pull "$OLLAMA_MODEL" +fi + +# If a command was passed, run it; otherwise wait on ollama +if [ $# -gt 0 ]; then + exec "$@" +else + wait $OLLAMA_PID +fi +EOF +RUN chmod +x /entrypoint.sh + +EXPOSE 11434 + +ENTRYPOINT ["/entrypoint.sh"] +CMD [] From df9533801dbd3ee0935579d8b504cc89afb66c4c Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 11:47:00 +0100 Subject: [PATCH 02/11] wip: docker image running --- README.md | 15 +++ docker/Dockerfile.cc-vec-bot | 177 +++++++++++++++++++++++++++++++++++ docker/cc-vec-bot.docker | 58 ------------ 3 files changed, 192 insertions(+), 58 deletions(-) create mode 100644 docker/Dockerfile.cc-vec-bot delete mode 100644 docker/cc-vec-bot.docker diff --git a/README.md b/README.md index ad49916..92e6956 100644 --- a/README.md +++ b/README.md @@ -140,6 +140,21 @@ export AWS_SECRET_ACCESS_KEY=your-secret uv run cc-vec index --url-patterns "%.edu" --limit 10 ``` +damian: +# Set environment variables +export OPENAI_BASE_URL=http://localhost:8321/v1 +export OPENAI_API_KEY=none # Llama Stack doesn't require a real key +export OPENAI_EMBEDDING_MODEL=ollama/nomic-embed-text:latest +export OPENAI_EMBEDDING_DIMENSIONS=768 + +# Set your Athena credentials +export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results +export AWS_PROFILE=cc-volunteers +export AWS_DEFAULT_REGION=us-east-1 + +# Use cc-vec with local models +uv run cc-vec index --url-patterns "%.edu" --limit 10 + **Documentation:** - [Llama Stack Docs](https://llamastack.github.io/) - [Llama Stack GitHub](https://github.com/meta-llama/llama-stack) diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot new file mode 100644 index 0000000..7f33aa9 --- /dev/null +++ b/docker/Dockerfile.cc-vec-bot @@ -0,0 +1,177 @@ +FROM llamastack/distribution-starter:0.4.1 +LABEL maintainer="damian@commoncrawl.org" + +USER root + +# Install minimal dependencies 
required by the Ollama install script +RUN apt-get update \ + && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd \ + && rm -rf /var/lib/apt/lists/* + +# Install Ollama (for local inference when OLLAMA_URL is not set) +RUN curl -fsSL https://ollama.ai/install.sh | sh + +ENV PATH="/usr/local/bin:${PATH}" + +# --------------------------------------------------------------------------- +# Build-time model pre-fetch (optional) +# Set PREFETCH_MODEL=1 to bake the model into the image at build time. +# This makes the image larger but faster to start. +# +# Build examples: +# docker build --build-arg PREFETCH_MODEL=1 -t cc-vec-bot . # bake tinyllama +# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . +# docker build -t cc-vec-bot . # no prefetch (default) +# --------------------------------------------------------------------------- +ARG PREFETCH_MODEL=0 +ARG INFERENCE_MODEL=tinyllama + +# Pre-fetch model at build time if PREFETCH_MODEL=1 +# Requires starting ollama serve temporarily during build +RUN if [ "$PREFETCH_MODEL" = "1" ]; then \ + echo "Pre-fetching model: ${INFERENCE_MODEL}"; \ + ollama serve & \ + OLLAMA_PID=$!; \ + sleep 5; \ + for i in 1 2 3 4 5 6 7 8 9 10; do \ + curl -s http://localhost:11434/api/tags >/dev/null 2>&1 && break; \ + sleep 2; \ + done; \ + ollama pull "${INFERENCE_MODEL}"; \ + kill $OLLAMA_PID 2>/dev/null || true; \ + echo "Model ${INFERENCE_MODEL} pre-fetched successfully"; \ + else \ + echo "Skipping model pre-fetch (PREFETCH_MODEL=0)"; \ + fi + +# --------------------------------------------------------------------------- +# Compatibility with deprecated llamastack/distribution-ollama +# Usage: +# export LLAMA_STACK_PORT=5001 +# docker run -it \ +# -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ +# -v ~/.llama:/root/.llama \ +# cc-vec-bot \ +# --port $LLAMA_STACK_PORT \ +# --env INFERENCE_MODEL=tinyllama \ +# --env OLLAMA_URL=http://host.docker.internal:11434 +# +# Or with built-in Ollama (no external Ollama needed): +# docker run -it \ +# -p 5001:5001 -p 11434:11434 \ +# -v ~/.llama:/root/.llama \ +# cc-vec-bot \ +# --port 5001 \ +# --env INFERENCE_MODEL=tinyllama +# --------------------------------------------------------------------------- + +# Default model (inherits from build ARG, can be overridden at runtime) +# tinyllama ~637MB (smallest practical LLM) +# all-minilm ~45MB (embeddings only) +# llama3.2:3b ~2GB (production) +ENV INFERENCE_MODEL=${INFERENCE_MODEL} + +# Default ports +ENV LLAMA_STACK_PORT=5001 +ENV OLLAMA_PORT=11434 + +# Create entrypoint script compatible with distribution-ollama CLI args +RUN cat <<'EOF' > /entrypoint.sh +#!/bin/bash +set -e + +# Parse --port and --env arguments (compatible with distribution-ollama) +while [[ $# -gt 0 ]]; do + case $1 in + --port) + LLAMA_STACK_PORT="$2" + shift 2 + ;; + --env) + # Parse KEY=VALUE and export it + if [[ "$2" =~ ^([^=]+)=(.*)$ ]]; then + export "${BASH_REMATCH[1]}"="${BASH_REMATCH[2]}" + fi + shift 2 + ;; + *) + # Unknown option, pass through + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +echo "==============================================" +echo "cc-vec-bot (llama-stack + ollama)" +echo "==============================================" +echo "LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}" +echo "INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}" +echo "OLLAMA_URL: ${OLLAMA_URL:-}" +echo "==============================================" + +# Determine Ollama URL +if [ -z "$OLLAMA_URL" ]; then + # No external Ollama URL provided 
- start local Ollama + echo "Starting local Ollama server..." + ollama serve & + OLLAMA_PID=$! + OLLAMA_URL="http://localhost:11434" + export OLLAMA_URL + + # Wait for Ollama to be ready + echo "Waiting for Ollama to start..." + for i in {1..30}; do + if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then + echo "Ollama is ready" + break + fi + if [ $i -eq 30 ]; then + echo "ERROR: Ollama failed to start" + exit 1 + fi + sleep 1 + done + + # Pull model if specified + if [ -n "$INFERENCE_MODEL" ]; then + echo "Pulling model: $INFERENCE_MODEL" + ollama pull "$INFERENCE_MODEL" + fi +else + # External Ollama URL provided - verify connectivity + echo "Using external Ollama at: $OLLAMA_URL" + for i in {1..10}; do + if curl -s "${OLLAMA_URL}/api/tags" >/dev/null 2>&1; then + echo "External Ollama is reachable" + break + fi + if [ $i -eq 10 ]; then + echo "WARNING: Cannot reach external Ollama at $OLLAMA_URL" + fi + sleep 1 + done +fi + +# Start llama-stack server +# The distribution-starter base image includes llama-stack +echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..." +export OLLAMA_URL="${OLLAMA_URL}" +export INFERENCE_MODEL="${INFERENCE_MODEL}" +exec llama stack run starter \ + --port "${LLAMA_STACK_PORT}" \ + "${EXTRA_ARGS[@]}" +EOF +#--env OLLAMA_URL="${OLLAMA_URL}" \ +#--env INFERENCE_MODEL="${INFERENCE_MODEL}" \ + +RUN chmod +x /entrypoint.sh + +# Volume for llama-stack config and model cache +VOLUME ["/root/.llama"] + +# Expose both llama-stack and ollama ports +EXPOSE 5001 11434 + +ENTRYPOINT ["/entrypoint.sh"] +CMD [] diff --git a/docker/cc-vec-bot.docker b/docker/cc-vec-bot.docker deleted file mode 100644 index b4e5f86..0000000 --- a/docker/cc-vec-bot.docker +++ /dev/null @@ -1,58 +0,0 @@ -FROM llamastack/distribution-starter:0.4.1 -LABEL maintainer="damian@commoncrawl.org" - -USER root - -# Install minimal dependencies required by the Ollama install script -RUN apt-get update \ - && apt-get install -y --no-install-recommends curl ca-certificates gnupg \ - && rm -rf /var/lib/apt/lists/* - -# Install Ollama -RUN curl -fsSL https://ollama.ai/install.sh | sh - -ENV PATH="/usr/local/bin:${PATH}" - -# Default model to pull on startup (tinyllama ~637MB, smallest practical LLM) -# For even smaller (embeddings only): all-minilm (~45MB) -# For production: llama3.2:3b (~2GB) -ENV OLLAMA_MODEL="tinyllama" - -# Create entrypoint script that starts ollama and pulls model on first run -RUN cat <<'EOF' > /entrypoint.sh -#!/bin/bash -set -e - -# Start ollama server in background -ollama serve & -OLLAMA_PID=$! - -# Wait for ollama to be ready -echo "Waiting for Ollama to start..." -for i in {1..30}; do - if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then - echo "Ollama is ready" - break - fi - sleep 1 -done - -# Pull model if not already present -if [ -n "$OLLAMA_MODEL" ]; then - echo "Ensuring model $OLLAMA_MODEL is available..." 
- ollama pull "$OLLAMA_MODEL" -fi - -# If a command was passed, run it; otherwise wait on ollama -if [ $# -gt 0 ]; then - exec "$@" -else - wait $OLLAMA_PID -fi -EOF -RUN chmod +x /entrypoint.sh - -EXPOSE 11434 - -ENTRYPOINT ["/entrypoint.sh"] -CMD [] From 183fbaa1f8178f2276fee93ef228b964d5414a8e Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 12:19:21 +0100 Subject: [PATCH 03/11] wip: non-streaming works --- chatbot-frontend/README.md | 32 ++++++++++++++++++++++++ chatbot-frontend/api.py | 41 +++++++++++++++++++++++++++++++ chatbot-frontend/index.html | 38 ++++++++++++++++++++++++++++ chatbot-frontend/requirements.txt | 3 +++ docker/Dockerfile.cc-vec-bot | 21 +++++++++++----- 5 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 chatbot-frontend/README.md create mode 100644 chatbot-frontend/api.py create mode 100644 chatbot-frontend/index.html create mode 100644 chatbot-frontend/requirements.txt diff --git a/chatbot-frontend/README.md b/chatbot-frontend/README.md new file mode 100644 index 0000000..70f1e4d --- /dev/null +++ b/chatbot-frontend/README.md @@ -0,0 +1,32 @@ +# Minimal Llama Chatbot Frontend + +This directory contains a minimal web-based chatbot UI and a FastAPI backend that proxies requests to an Ollama (Llama) backend. + +## Usage + +1. **Install dependencies** + +```bash +pip install -r requirements.txt +``` + +2. **Run the backend** + +```bash +uvicorn api:app --reload +``` + +3. **Open the frontend** + +Open `index.html` in your browser (or serve it with any static file server). + +## Configuration + +- The backend expects an Ollama server running at `http://localhost:11434` by default. +- You can override the Ollama URL and model with environment variables: + - `OLLAMA_URL` (e.g. `http://localhost:11434/api/generate`) + - `INFERENCE_MODEL` (e.g. `tinyllama`) + +## Notes +- The backend is intentionally minimal and does not persist chat history. +- The frontend is pure HTML/JS, no frameworks or build step required. 
diff --git a/chatbot-frontend/api.py b/chatbot-frontend/api.py new file mode 100644 index 0000000..0bb7d7a --- /dev/null +++ b/chatbot-frontend/api.py @@ -0,0 +1,41 @@ +from fastapi import FastAPI, Request +from fastapi.responses import JSONResponse, StreamingResponse, FileResponse +import httpx +import os +import json + +app = FastAPI() +OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/generate") +INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama") +STREAMING = os.environ.get("OLLAMA_STREAMING", "0") != "0" + +@app.get("/") +async def serve_index(): + return FileResponse("index.html") + +@app.post("/api/chat") +async def chat(request: Request): + data = await request.json() + prompt = data.get("message", "") + payload = {"model": INFERENCE_MODEL, "prompt": prompt, "stream": STREAMING} + async with httpx.AsyncClient() as client: + r = await client.post(OLLAMA_URL, json=payload, timeout=None) + r.raise_for_status() + if STREAMING: + # Stream JSON lines to the frontend as a single event stream + async def event_stream(): + async for line in r.aiter_lines(): + line = line.strip() + print(line) + if not line: + continue + try: + obj = json.loads(line) + # Only send the 'response' field + yield f"data: {json.dumps({'response': obj.get('response', '')})}\n\n" + except Exception: + continue + return StreamingResponse(event_stream(), media_type="text/event-stream") + else: + result = r.json() + return JSONResponse({"response": result.get("response", "")}) diff --git a/chatbot-frontend/index.html b/chatbot-frontend/index.html new file mode 100644 index 0000000..a931bf9 --- /dev/null +++ b/chatbot-frontend/index.html @@ -0,0 +1,38 @@ + + + + Llama Chatbot + + + +
Llama Chatbot
+
+ + + + + diff --git a/chatbot-frontend/requirements.txt b/chatbot-frontend/requirements.txt new file mode 100644 index 0000000..c937678 --- /dev/null +++ b/chatbot-frontend/requirements.txt @@ -0,0 +1,3 @@ +fastapi +httpx +uvicorn diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot index 7f33aa9..16a7131 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/docker/Dockerfile.cc-vec-bot @@ -5,7 +5,7 @@ USER root # Install minimal dependencies required by the Ollama install script RUN apt-get update \ - && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd \ + && apt-get install -y --no-install-recommends curl ca-certificates gnupg \ && rm -rf /var/lib/apt/lists/* # Install Ollama (for local inference when OLLAMA_URL is not set) @@ -75,6 +75,17 @@ ENV INFERENCE_MODEL=${INFERENCE_MODEL} ENV LLAMA_STACK_PORT=5001 ENV OLLAMA_PORT=11434 + +# --------------------------------------------------------------------------- +# Install chatbot-frontend +# --------------------------------------------------------------------------- + + + +# --------------------------------------------------------------------------- +# Construct entrypoint script +# --------------------------------------------------------------------------- + # Create entrypoint script compatible with distribution-ollama CLI args RUN cat <<'EOF' > /entrypoint.sh #!/bin/bash @@ -156,14 +167,12 @@ fi # Start llama-stack server # The distribution-starter base image includes llama-stack echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..." -export OLLAMA_URL="${OLLAMA_URL}" -export INFERENCE_MODEL="${INFERENCE_MODEL}" -exec llama stack run starter \ +exec llama stack run /root/.llama/distributions/ollama/run.yaml \ --port "${LLAMA_STACK_PORT}" \ + --env OLLAMA_URL="${OLLAMA_URL}" \ + --env INFERENCE_MODEL="${INFERENCE_MODEL}" \ "${EXTRA_ARGS[@]}" EOF -#--env OLLAMA_URL="${OLLAMA_URL}" \ -#--env INFERENCE_MODEL="${INFERENCE_MODEL}" \ RUN chmod +x /entrypoint.sh From 0b6213abdbd96bb346eafeba5d9ffa6ddee41444 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 12:38:51 +0100 Subject: [PATCH 04/11] wip: chatbot works inside docker container --- chatbot-frontend/api.py | 1 + chatbot-frontend/index.html | 3 ++- docker/Dockerfile.cc-vec-bot | 24 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/chatbot-frontend/api.py b/chatbot-frontend/api.py index 0bb7d7a..755a753 100644 --- a/chatbot-frontend/api.py +++ b/chatbot-frontend/api.py @@ -11,6 +11,7 @@ @app.get("/") async def serve_index(): + print("serving index.html") return FileResponse("index.html") @app.post("/api/chat") diff --git a/chatbot-frontend/index.html b/chatbot-frontend/index.html index a931bf9..282f323 100644 --- a/chatbot-frontend/index.html +++ b/chatbot-frontend/index.html @@ -9,7 +9,8 @@ -
Llama Chatbot
+ Creative Commons Chatbot
+ powered by Llama Stack
diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot index 16a7131..d6a71a6 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/docker/Dockerfile.cc-vec-bot @@ -5,7 +5,7 @@ USER root # Install minimal dependencies required by the Ollama install script RUN apt-get update \ - && apt-get install -y --no-install-recommends curl ca-certificates gnupg \ + && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd \ && rm -rf /var/lib/apt/lists/* # Install Ollama (for local inference when OLLAMA_URL is not set) @@ -74,13 +74,18 @@ ENV INFERENCE_MODEL=${INFERENCE_MODEL} # Default ports ENV LLAMA_STACK_PORT=5001 ENV OLLAMA_PORT=11434 - +ENV CHATBOT_PORT=8008 # --------------------------------------------------------------------------- # Install chatbot-frontend # --------------------------------------------------------------------------- - +WORKDIR /opt/chatbot-frontend +COPY chatbot-frontend . +RUN apt-get update && apt-get install -y python3-pip && \ + pip3 install --no-cache-dir -r requirements.txt && \ + apt-get remove -y python3-pip && apt-get autoremove -y && \ + rm -rf /var/lib/apt/lists/* # --------------------------------------------------------------------------- # Construct entrypoint script @@ -164,13 +169,20 @@ else done fi +# Start the chatbot in the background +echo "Starting chatbot-frontend..." +(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port ${CHATBOT_PORT:-8000} &) +#(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port ${CHATBOT_PORT:-8000}) +CHATBOT_PID=$! +echo "Chatbot-frontend started with PID ${CHATBOT_PID}" + # Start llama-stack server # The distribution-starter base image includes llama-stack echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..." -exec llama stack run /root/.llama/distributions/ollama/run.yaml \ +export OLLAMA_URL="${OLLAMA_URL}" +export INFERENCE_MODEL="${INFERENCE_MODEL}" +exec llama stack run starter \ --port "${LLAMA_STACK_PORT}" \ - --env OLLAMA_URL="${OLLAMA_URL}" \ - --env INFERENCE_MODEL="${INFERENCE_MODEL}" \ "${EXTRA_ARGS[@]}" EOF From 5264bd2e0bbf271eba262471293d4062d915685b Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 13:38:11 +0100 Subject: [PATCH 05/11] feat: working chatbot --- chatbot-frontend/api.py | 2 +- docker/.env.sample | 24 ++++++++++++++++++++++++ docker/.gitignore | 1 + docker/docker-compose.yaml | 21 +++++++++++++++++++++ 4 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 docker/.env.sample create mode 100644 docker/.gitignore create mode 100644 docker/docker-compose.yaml diff --git a/chatbot-frontend/api.py b/chatbot-frontend/api.py index 755a753..992af70 100644 --- a/chatbot-frontend/api.py +++ b/chatbot-frontend/api.py @@ -5,7 +5,7 @@ import json app = FastAPI() -OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434/api/generate") +OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") + "/api/generate" INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama") STREAMING = os.environ.get("OLLAMA_STREAMING", "0") != "0" diff --git a/docker/.env.sample b/docker/.env.sample new file mode 100644 index 0000000..8ce777f --- /dev/null +++ b/docker/.env.sample @@ -0,0 +1,24 @@ +# Docker Compose Configuration for cc-vec-bot +# Copy this file to .env and customize values as needed + +# Run: docker compose up --build + +# Inference model to use (tinyllama, llama2, llama3.2:3b, etc.) 
+# recommended: tinyllama for local testing (700MB), llama3.2:3B for production +INFERENCE_MODEL=tinyllama + +# LLM model files are large. +# Set to 1 to pre-fetch the model at build time (increases image size and build time) +# Set to 0 to fetch when the image is run (smaller, faster build but redundant runtime fetch of large models) +PREFETCH_MODEL=1 + +# Ports +LLAMA_STACK_PORT=5001 +CHATBOT_PORT=8008 +OLLAMA_PORT=11434 + +# External ollama/chromadb URL +# Set these if you want to customize llama-stack's behavior +# OLLAMA_URL=http://localhost:11434 +# CHROMADB_URL=http://localhost:8000 + diff --git a/docker/.gitignore b/docker/.gitignore new file mode 100644 index 0000000..4c49bd7 --- /dev/null +++ b/docker/.gitignore @@ -0,0 +1 @@ +.env diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml new file mode 100644 index 0000000..9af9646 --- /dev/null +++ b/docker/docker-compose.yaml @@ -0,0 +1,21 @@ +services: + cc-vec-bot: + build: + context: .. + dockerfile: docker/Dockerfile.cc-vec-bot + args: + PREFETCH_MODEL: ${PREFETCH_MODEL:-0} + INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} + image: cc-vec-bot + environment: + LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001} + CHATBOT_PORT: ${CHATBOT_PORT:-8008} + OLLAMA_PORT: ${OLLAMA_PORT:-11434} + ports: + - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" + - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}" + - "${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}" + volumes: + - ~/.llama:/root/.llama + command: ["--port", "${LLAMA_STACK_PORT:-5001}"] + From 9321c074a7a625f41478d48aeb97ea48cbf734de Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 13:50:51 +0100 Subject: [PATCH 06/11] feat: chatbot streaming, readme cleanup --- chatbot-frontend/README.md | 30 ++++++++--------------- chatbot-frontend/api.py | 47 +++++++++++++++++++++--------------- chatbot-frontend/index.html | 40 ++++++++++++++++++++++++++++-- docker/Dockerfile.cc-vec-bot | 7 ++---- docker/README.md | 11 +++++++++ docker/docker-compose.yaml | 2 +- 6 files changed, 89 insertions(+), 48 deletions(-) create mode 100644 docker/README.md diff --git a/chatbot-frontend/README.md b/chatbot-frontend/README.md index 70f1e4d..3b06cc1 100644 --- a/chatbot-frontend/README.md +++ b/chatbot-frontend/README.md @@ -1,32 +1,22 @@ -# Minimal Llama Chatbot Frontend +# Minimal CC Chatbot Frontend -This directory contains a minimal web-based chatbot UI and a FastAPI backend that proxies requests to an Ollama (Llama) backend. +## Quickstart -## Usage - -1. **Install dependencies** +Launch ollama server, then run: ```bash pip install -r requirements.txt -``` - -2. **Run the backend** - -```bash uvicorn api:app --reload ``` -3. **Open the frontend** - -Open `index.html` in your browser (or serve it with any static file server). +Click on the link that uvicorn prints in your terminal to open the frontend. ## Configuration -- The backend expects an Ollama server running at `http://localhost:11434` by default. -- You can override the Ollama URL and model with environment variables: - - `OLLAMA_URL` (e.g. `http://localhost:11434/api/generate`) - - `INFERENCE_MODEL` (e.g. `tinyllama`) +Configuration is done via environment variables in `api.py`. Defaults: -## Notes -- The backend is intentionally minimal and does not persist chat history. -- The frontend is pure HTML/JS, no frameworks or build step required. 
+```bash +OLLAMA_URL=http://localhost:11434 # /api/generate is appended +INFERENCE_MODEL=tinyllama +STREAMING=1 +```` diff --git a/chatbot-frontend/api.py b/chatbot-frontend/api.py index 992af70..0091cbc 100644 --- a/chatbot-frontend/api.py +++ b/chatbot-frontend/api.py @@ -7,7 +7,7 @@ app = FastAPI() OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") + "/api/generate" INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama") -STREAMING = os.environ.get("OLLAMA_STREAMING", "0") != "0" +STREAMING = os.environ.get("OLLAMA_STREAMING", "1") != "0" @app.get("/") async def serve_index(): @@ -19,24 +19,31 @@ async def chat(request: Request): data = await request.json() prompt = data.get("message", "") payload = {"model": INFERENCE_MODEL, "prompt": prompt, "stream": STREAMING} - async with httpx.AsyncClient() as client: - r = await client.post(OLLAMA_URL, json=payload, timeout=None) - r.raise_for_status() - if STREAMING: - # Stream JSON lines to the frontend as a single event stream - async def event_stream(): - async for line in r.aiter_lines(): - line = line.strip() - print(line) - if not line: - continue - try: - obj = json.loads(line) - # Only send the 'response' field - yield f"data: {json.dumps({'response': obj.get('response', '')})}\n\n" - except Exception: - continue - return StreamingResponse(event_stream(), media_type="text/event-stream") - else: + + if STREAMING: + # Stream JSON lines to the frontend as SSE + async def event_stream(): + async with httpx.AsyncClient(timeout=None) as client: + async with client.stream("POST", OLLAMA_URL, json=payload) as r: + r.raise_for_status() + async for line in r.aiter_lines(): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + # Only send the 'response' field + response_text = obj.get('response', '') + if response_text: + yield f"data: {json.dumps({'response': response_text})}\n\n" + except Exception as e: + print(f"Error parsing line: {e}") + continue + return StreamingResponse(event_stream(), media_type="text/event-stream") + else: + # Non-streaming mode + async with httpx.AsyncClient(timeout=None) as client: + r = await client.post(OLLAMA_URL, json=payload) + r.raise_for_status() result = r.json() return JSONResponse({"response": result.get("response", "")}) diff --git a/chatbot-frontend/index.html b/chatbot-frontend/index.html index 282f323..a3076df 100644 --- a/chatbot-frontend/index.html +++ b/chatbot-frontend/index.html @@ -22,13 +22,49 @@
Creative Commons Chatbot
if (!msg) return; chat.value += "You: " + msg + "\n"; input.value = ""; + const res = await fetch('/api/chat', { method: 'POST', headers: {'Content-Type': 'application/json'}, body: JSON.stringify({message: msg}) }); - const data = await res.json(); - chat.value += "Bot: " + data.response + "\n"; + + // Check if response is streaming (SSE) or regular JSON + const contentType = res.headers.get('content-type'); + if (contentType && contentType.includes('text/event-stream')) { + // Handle streaming response + chat.value += "Bot: "; + const reader = res.body.getReader(); + const decoder = new TextDecoder(); + let buffer = ''; + + while (true) { + const {done, value} = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, {stream: true}); + const lines = buffer.split('\n'); + buffer = lines.pop(); // Keep incomplete line in buffer + + for (const line of lines) { + if (line.startsWith('data: ')) { + try { + const data = JSON.parse(line.slice(6)); + chat.value += data.response || ''; + chat.scrollTop = chat.scrollHeight; + } catch (e) { + // Skip malformed JSON + } + } + } + } + chat.value += "\n"; + } else { + // Handle non-streaming JSON response + const data = await res.json(); + chat.value += "Bot: " + data.response + "\n"; + } + chat.scrollTop = chat.scrollHeight; } document.getElementById('input').addEventListener('keydown', e => { diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot index d6a71a6..f2f7271 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/docker/Dockerfile.cc-vec-bot @@ -5,7 +5,7 @@ USER root # Install minimal dependencies required by the Ollama install script RUN apt-get update \ - && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd \ + && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd python3-pip \ && rm -rf /var/lib/apt/lists/* # Install Ollama (for local inference when OLLAMA_URL is not set) @@ -82,10 +82,7 @@ ENV CHATBOT_PORT=8008 WORKDIR /opt/chatbot-frontend COPY chatbot-frontend . -RUN apt-get update && apt-get install -y python3-pip && \ - pip3 install --no-cache-dir -r requirements.txt && \ - apt-get remove -y python3-pip && apt-get autoremove -y && \ - rm -rf /var/lib/apt/lists/* +RUN pip3 install --no-cache-dir -r requirements.txt # --------------------------------------------------------------------------- # Construct entrypoint script diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..0ea6a30 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,11 @@ +# CC chatbot docker setup + +## Quickstart + +To run with default configuration (internal ollama, tinyllama baked into image, chatbot on http://localhost:8008), run: + +`docker-compose up --build` + +## Configuration + +Copy `.env.sample` to `.env` and modify as needed to customize configuration. diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 9af9646..0c31430 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -4,7 +4,7 @@ services: context: .. 
dockerfile: docker/Dockerfile.cc-vec-bot args: - PREFETCH_MODEL: ${PREFETCH_MODEL:-0} + PREFETCH_MODEL: ${PREFETCH_MODEL:-1} INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} image: cc-vec-bot environment: From 01ce32c4994ce38134ef083e1895de70aabd892c Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 13:57:44 +0100 Subject: [PATCH 07/11] chore: simplify dockerfile by making entrypoint.sh its own file --- docker/.env.sample | 3 ++ docker/Dockerfile.cc-vec-bot | 101 ++--------------------------------- docker/docker-compose.yaml | 10 ++-- docker/entrypoint.sh | 93 ++++++++++++++++++++++++++++++++ 4 files changed, 107 insertions(+), 100 deletions(-) create mode 100644 docker/entrypoint.sh diff --git a/docker/.env.sample b/docker/.env.sample index 8ce777f..3e7be46 100644 --- a/docker/.env.sample +++ b/docker/.env.sample @@ -17,6 +17,9 @@ LLAMA_STACK_PORT=5001 CHATBOT_PORT=8008 OLLAMA_PORT=11434 +# Streaming mode for chatbot responses (0=off, 1=on) +OLLAMA_STREAMING=1 + # External ollama/chromadb URL # Set these if you want to customize llama-stack's behavior # OLLAMA_URL=http://localhost:11434 diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot index f2f7271..30f729e 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/docker/Dockerfile.cc-vec-bot @@ -76,6 +76,9 @@ ENV LLAMA_STACK_PORT=5001 ENV OLLAMA_PORT=11434 ENV CHATBOT_PORT=8008 +# Streaming mode (0=off, 1=on) +ENV OLLAMA_STREAMING=1 + # --------------------------------------------------------------------------- # Install chatbot-frontend # --------------------------------------------------------------------------- @@ -85,104 +88,10 @@ COPY chatbot-frontend . RUN pip3 install --no-cache-dir -r requirements.txt # --------------------------------------------------------------------------- -# Construct entrypoint script +# Copy and setup entrypoint script # --------------------------------------------------------------------------- -# Create entrypoint script compatible with distribution-ollama CLI args -RUN cat <<'EOF' > /entrypoint.sh -#!/bin/bash -set -e - -# Parse --port and --env arguments (compatible with distribution-ollama) -while [[ $# -gt 0 ]]; do - case $1 in - --port) - LLAMA_STACK_PORT="$2" - shift 2 - ;; - --env) - # Parse KEY=VALUE and export it - if [[ "$2" =~ ^([^=]+)=(.*)$ ]]; then - export "${BASH_REMATCH[1]}"="${BASH_REMATCH[2]}" - fi - shift 2 - ;; - *) - # Unknown option, pass through - EXTRA_ARGS+=("$1") - shift - ;; - esac -done - -echo "==============================================" -echo "cc-vec-bot (llama-stack + ollama)" -echo "==============================================" -echo "LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}" -echo "INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}" -echo "OLLAMA_URL: ${OLLAMA_URL:-}" -echo "==============================================" - -# Determine Ollama URL -if [ -z "$OLLAMA_URL" ]; then - # No external Ollama URL provided - start local Ollama - echo "Starting local Ollama server..." - ollama serve & - OLLAMA_PID=$! - OLLAMA_URL="http://localhost:11434" - export OLLAMA_URL - - # Wait for Ollama to be ready - echo "Waiting for Ollama to start..." 
- for i in {1..30}; do - if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then - echo "Ollama is ready" - break - fi - if [ $i -eq 30 ]; then - echo "ERROR: Ollama failed to start" - exit 1 - fi - sleep 1 - done - - # Pull model if specified - if [ -n "$INFERENCE_MODEL" ]; then - echo "Pulling model: $INFERENCE_MODEL" - ollama pull "$INFERENCE_MODEL" - fi -else - # External Ollama URL provided - verify connectivity - echo "Using external Ollama at: $OLLAMA_URL" - for i in {1..10}; do - if curl -s "${OLLAMA_URL}/api/tags" >/dev/null 2>&1; then - echo "External Ollama is reachable" - break - fi - if [ $i -eq 10 ]; then - echo "WARNING: Cannot reach external Ollama at $OLLAMA_URL" - fi - sleep 1 - done -fi - -# Start the chatbot in the background -echo "Starting chatbot-frontend..." -(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port ${CHATBOT_PORT:-8000} &) -#(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port ${CHATBOT_PORT:-8000}) -CHATBOT_PID=$! -echo "Chatbot-frontend started with PID ${CHATBOT_PID}" - -# Start llama-stack server -# The distribution-starter base image includes llama-stack -echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..." -export OLLAMA_URL="${OLLAMA_URL}" -export INFERENCE_MODEL="${INFERENCE_MODEL}" -exec llama stack run starter \ - --port "${LLAMA_STACK_PORT}" \ - "${EXTRA_ARGS[@]}" -EOF - +COPY docker/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh # Volume for llama-stack config and model cache diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 0c31430..e835e85 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -11,11 +11,13 @@ services: LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001} CHATBOT_PORT: ${CHATBOT_PORT:-8008} OLLAMA_PORT: ${OLLAMA_PORT:-11434} + OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1} + INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} ports: - - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" - - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}" - - "${OLLAMA_PORT:-11434}:${OLLAMA_PORT:-11434}" + - "${LLAMA_STACK_PORT}:${LLAMA_STACK_PORT}" + - "${CHATBOT_PORT}:${CHATBOT_PORT}" + - "${OLLAMA_PORT}:${OLLAMA_PORT}" volumes: - ~/.llama:/root/.llama - command: ["--port", "${LLAMA_STACK_PORT:-5001}"] + command: ["--port", "${LLAMA_STACK_PORT}"] diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..cd08123 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,93 @@ +#!/bin/bash +set -e + +EXTRA_ARGS=() +while [[ $# -gt 0 ]]; do + case $1 in + --port) + LLAMA_STACK_PORT="$2" + shift 2 + ;; + *) + # Unknown option, pass through + EXTRA_ARGS+=("$1") + shift + ;; + esac +done + +echo "==============================================" +echo "cc-vec-bot (llama-stack + ollama)" +echo "==============================================" +echo "LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}" +echo "INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}" +echo "OLLAMA_URL: ${OLLAMA_URL:-}" +echo "CHATBOT_PORT: ${CHATBOT_PORT:-8008}" +echo "==============================================" + +# Determine Ollama URL +if [ -z "$OLLAMA_URL" ]; then + # No external Ollama URL provided - start local Ollama + echo "Starting local Ollama server..." + ollama serve & + OLLAMA_PID=$! + OLLAMA_URL="http://localhost:11434" + export OLLAMA_URL + + # Wait for Ollama to be ready + echo "Waiting for Ollama to start..." 
+ for i in {1..30}; do + if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then + echo "Ollama is ready" + break + fi + if [ $i -eq 30 ]; then + echo "ERROR: Ollama failed to start" + exit 1 + fi + sleep 1 + done + + # Pull model if specified and PREFETCH_MODEL wasn't set at build time + if [ -n "$INFERENCE_MODEL" ]; then + echo "Checking if model needs to be pulled: $INFERENCE_MODEL" + if ! ollama list | grep -q "^${INFERENCE_MODEL}"; then + echo "Pulling model: $INFERENCE_MODEL" + ollama pull "$INFERENCE_MODEL" + else + echo "Model $INFERENCE_MODEL already available" + fi + fi +else + # External Ollama URL provided - verify connectivity + echo "Using external Ollama at: $OLLAMA_URL" + for i in {1..10}; do + if curl -s "${OLLAMA_URL}/api/tags" >/dev/null 2>&1; then + echo "External Ollama is reachable" + break + fi + if [ $i -eq 10 ]; then + echo "WARNING: Cannot reach external Ollama at $OLLAMA_URL" + fi + sleep 1 + done +fi + +# Start the chatbot in the background +echo "Starting chatbot-frontend on port ${CHATBOT_PORT}..." +(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port "${CHATBOT_PORT}" > /var/log/chatbot.log 2>&1 &) +CHATBOT_PID=$! +echo "Chatbot-frontend started with PID ${CHATBOT_PID}" + +# Give chatbot a moment to start +sleep 2 + +# Start llama-stack server +# The distribution-starter base image includes llama-stack +echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..." +export OLLAMA_URL="${OLLAMA_URL}" +export INFERENCE_MODEL="${INFERENCE_MODEL}" +exec llama stack run starter \ + --port "${LLAMA_STACK_PORT}" \ + "${EXTRA_ARGS[@]}" + From 7c876f4a2f2adc59dfd0a84283b31cb1e43cff66 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 14:16:33 +0100 Subject: [PATCH 08/11] feat: external ollama; simplify instructions --- docker/.env.sample | 12 ++++++++---- docker/Dockerfile.cc-vec-bot | 1 - docker/README.md | 5 +++++ docker/docker-compose.yaml | 8 +++----- docker/entrypoint.sh | 8 +++++++- 5 files changed, 23 insertions(+), 11 deletions(-) diff --git a/docker/.env.sample b/docker/.env.sample index 3e7be46..76d48a2 100644 --- a/docker/.env.sample +++ b/docker/.env.sample @@ -15,13 +15,17 @@ PREFETCH_MODEL=1 # Ports LLAMA_STACK_PORT=5001 CHATBOT_PORT=8008 -OLLAMA_PORT=11434 # Streaming mode for chatbot responses (0=off, 1=on) OLLAMA_STREAMING=1 -# External ollama/chromadb URL -# Set these if you want to customize llama-stack's behavior -# OLLAMA_URL=http://localhost:11434 +# Ollama URL (optional - defaults to local Ollama on port 11434) +# Leave empty to use built-in Ollama, or set to external instance +# Examples: +# OLLAMA_URL=http://host.docker.internal:11434 +# OLLAMA_URL=http://192.168.1.100:11435 +# OLLAMA_URL= + +# ChromaDB URL (optional - for persistent vector storage) # CHROMADB_URL=http://localhost:8000 diff --git a/docker/Dockerfile.cc-vec-bot b/docker/Dockerfile.cc-vec-bot index 30f729e..abbdfce 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/docker/Dockerfile.cc-vec-bot @@ -73,7 +73,6 @@ ENV INFERENCE_MODEL=${INFERENCE_MODEL} # Default ports ENV LLAMA_STACK_PORT=5001 -ENV OLLAMA_PORT=11434 ENV CHATBOT_PORT=8008 # Streaming mode (0=off, 1=on) diff --git a/docker/README.md b/docker/README.md index 0ea6a30..b75ca18 100644 --- a/docker/README.md +++ b/docker/README.md @@ -9,3 +9,8 @@ To run with default configuration (internal ollama, tinyllama baked into image, ## Configuration Copy `.env.sample` to `.env` and modify as needed to customize configuration. 
+ +Alternatively, set environment variables directly in your shell before running the `docker-compose up` command. For example: +```bash +OLLAMA_URL=http://host.docker.internal:11434 PREFETCH_MODEL=0 INFERENCE_MODEL=llama3.2:3B docker-compose up --build +``` \ No newline at end of file diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index e835e85..1a121eb 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -10,14 +10,12 @@ services: environment: LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001} CHATBOT_PORT: ${CHATBOT_PORT:-8008} - OLLAMA_PORT: ${OLLAMA_PORT:-11434} + OLLAMA_URL: ${OLLAMA_URL:-} OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1} INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} ports: - - "${LLAMA_STACK_PORT}:${LLAMA_STACK_PORT}" - - "${CHATBOT_PORT}:${CHATBOT_PORT}" - - "${OLLAMA_PORT}:${OLLAMA_PORT}" + - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" + - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}" volumes: - ~/.llama:/root/.llama - command: ["--port", "${LLAMA_STACK_PORT}"] diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index cd08123..64d9eb9 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -75,7 +75,13 @@ fi # Start the chatbot in the background echo "Starting chatbot-frontend on port ${CHATBOT_PORT}..." -(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port "${CHATBOT_PORT}" > /var/log/chatbot.log 2>&1 &) +(cd /opt/chatbot-frontend && uvicorn api:app --host 0.0.0.0 --port "${CHATBOT_PORT}" &) +SUCCESS=$? +if [ $SUCCESS -ne 0 ]; then + cat /var/log/chatbot.log + echo "ERROR: Failed to start chatbot-frontend" + exit 1 +fi CHATBOT_PID=$! echo "Chatbot-frontend started with PID ${CHATBOT_PID}" From 057d1e2b39a871a805a5434203aa45d34ad25411 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Thu, 15 Jan 2026 14:25:15 +0100 Subject: [PATCH 09/11] chore: restructure --- {chatbot-frontend => cc-chatbot/chatbot}/README.md | 0 {chatbot-frontend => cc-chatbot/chatbot}/api.py | 0 {chatbot-frontend => cc-chatbot/chatbot}/index.html | 0 {chatbot-frontend => cc-chatbot/chatbot}/requirements.txt | 0 {docker => cc-chatbot/docker}/.env.sample | 0 {docker => cc-chatbot/docker}/.gitignore | 0 {docker => cc-chatbot/docker}/Dockerfile.cc-vec-bot | 2 +- {docker => cc-chatbot/docker}/README.md | 0 {docker => cc-chatbot/docker}/docker-compose.yaml | 0 {docker => cc-chatbot/docker}/entrypoint.sh | 0 10 files changed, 1 insertion(+), 1 deletion(-) rename {chatbot-frontend => cc-chatbot/chatbot}/README.md (100%) rename {chatbot-frontend => cc-chatbot/chatbot}/api.py (100%) rename {chatbot-frontend => cc-chatbot/chatbot}/index.html (100%) rename {chatbot-frontend => cc-chatbot/chatbot}/requirements.txt (100%) rename {docker => cc-chatbot/docker}/.env.sample (100%) rename {docker => cc-chatbot/docker}/.gitignore (100%) rename {docker => cc-chatbot/docker}/Dockerfile.cc-vec-bot (99%) rename {docker => cc-chatbot/docker}/README.md (100%) rename {docker => cc-chatbot/docker}/docker-compose.yaml (100%) rename {docker => cc-chatbot/docker}/entrypoint.sh (100%) diff --git a/chatbot-frontend/README.md b/cc-chatbot/chatbot/README.md similarity index 100% rename from chatbot-frontend/README.md rename to cc-chatbot/chatbot/README.md diff --git a/chatbot-frontend/api.py b/cc-chatbot/chatbot/api.py similarity index 100% rename from chatbot-frontend/api.py rename to cc-chatbot/chatbot/api.py diff --git a/chatbot-frontend/index.html b/cc-chatbot/chatbot/index.html similarity index 100% rename from chatbot-frontend/index.html rename to 
cc-chatbot/chatbot/index.html diff --git a/chatbot-frontend/requirements.txt b/cc-chatbot/chatbot/requirements.txt similarity index 100% rename from chatbot-frontend/requirements.txt rename to cc-chatbot/chatbot/requirements.txt diff --git a/docker/.env.sample b/cc-chatbot/docker/.env.sample similarity index 100% rename from docker/.env.sample rename to cc-chatbot/docker/.env.sample diff --git a/docker/.gitignore b/cc-chatbot/docker/.gitignore similarity index 100% rename from docker/.gitignore rename to cc-chatbot/docker/.gitignore diff --git a/docker/Dockerfile.cc-vec-bot b/cc-chatbot/docker/Dockerfile.cc-vec-bot similarity index 99% rename from docker/Dockerfile.cc-vec-bot rename to cc-chatbot/docker/Dockerfile.cc-vec-bot index abbdfce..0f0f81a 100644 --- a/docker/Dockerfile.cc-vec-bot +++ b/cc-chatbot/docker/Dockerfile.cc-vec-bot @@ -83,7 +83,7 @@ ENV OLLAMA_STREAMING=1 # --------------------------------------------------------------------------- WORKDIR /opt/chatbot-frontend -COPY chatbot-frontend . +COPY chatbot . RUN pip3 install --no-cache-dir -r requirements.txt # --------------------------------------------------------------------------- diff --git a/docker/README.md b/cc-chatbot/docker/README.md similarity index 100% rename from docker/README.md rename to cc-chatbot/docker/README.md diff --git a/docker/docker-compose.yaml b/cc-chatbot/docker/docker-compose.yaml similarity index 100% rename from docker/docker-compose.yaml rename to cc-chatbot/docker/docker-compose.yaml diff --git a/docker/entrypoint.sh b/cc-chatbot/docker/entrypoint.sh similarity index 100% rename from docker/entrypoint.sh rename to cc-chatbot/docker/entrypoint.sh From f884d91af784967ab19bb8d31ce7a6311b1fa6f5 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 19 Jan 2026 14:33:07 +0100 Subject: [PATCH 10/11] feat: docker-compose for convenience; README --- cc-chatbot/docker/README.md | 34 +++++++++++++++++++++++++++++++++- docker/docker-compose.yaml | 23 +++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 docker/docker-compose.yaml diff --git a/cc-chatbot/docker/README.md b/cc-chatbot/docker/README.md index b75ca18..90c3cb0 100644 --- a/cc-chatbot/docker/README.md +++ b/cc-chatbot/docker/README.md @@ -13,4 +13,36 @@ Copy `.env.sample` to `.env` and modify as needed to customize configuration. Alternatively, set environment variables directly in your shell before running the `docker-compose up` command. For example: ```bash OLLAMA_URL=http://host.docker.internal:11434 PREFETCH_MODEL=0 INFERENCE_MODEL=llama3.2:3B docker-compose up --build -``` \ No newline at end of file +``` + + +## Populating a vector store + +Spin up a llama-stack instance where you want the vector store to live: + +```bash +uv run --with llama-stack==0.4.1 llama stack run starter +``` + +Wait until you see the `Uvicorn running on ` message. 
Then, test everything works: + +```bash +# Set environment variables +export OPENAI_BASE_URL=http://localhost:8321/v1 +export OPENAI_API_KEY=none # Llama Stack doesn't require a real key +export OPENAI_EMBEDDING_MODEL=sentence-transformers/nomic-ai/nomic-embed-text-v1.5 +export OPENAI_EMBEDDING_DIMENSIONS=768 + +# Set your Athena credentials +export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results +export AWS_PROFILE=cc-volunteers +export AWS_DEFAULT_REGION=us-east-1 + +# Use cc-vec with local models +uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 10 +``` + +If it succeeds run again with `--limit 1000` to index everything + + +> Note: if running debug locally, llama stack needs additional pip packages `sentence-transformers einops` diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml new file mode 100644 index 0000000..d93e2b1 --- /dev/null +++ b/docker/docker-compose.yaml @@ -0,0 +1,23 @@ +services: + cc-vec-bot: + build: + context: .. + dockerfile: docker/Dockerfile.cc-vec-bot + args: + PREFETCH_MODEL: ${PREFETCH_MODEL:-1} + INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} + image: cc-vec-bot + environment: + LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001} + CHATBOT_PORT: ${CHATBOT_PORT:-8008} + OLLAMA_URL: ${OLLAMA_URL:-} + OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1} + INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} + ports: + - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" + - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}" + - "11434:11434" + volumes: + - ~/.llama:/root/.llama + + From 8b0201eda31e54903bc8cb7153f54c423f0f3052 Mon Sep 17 00:00:00 2001 From: Damian Stewart Date: Mon, 19 Jan 2026 17:02:13 +0100 Subject: [PATCH 11/11] wip: chatbot instructions iteration --- cc-chatbot/chatbot/README.md | 33 +++++++++++++++++++++++++ cc-chatbot/docker/Dockerfile.cc-vec-bot | 2 +- docker/docker-compose.yaml | 23 ----------------- 3 files changed, 34 insertions(+), 24 deletions(-) delete mode 100644 docker/docker-compose.yaml diff --git a/cc-chatbot/chatbot/README.md b/cc-chatbot/chatbot/README.md index 3b06cc1..b0313ab 100644 --- a/cc-chatbot/chatbot/README.md +++ b/cc-chatbot/chatbot/README.md @@ -20,3 +20,36 @@ OLLAMA_URL=http://localhost:11434 # /api/generate is appended INFERENCE_MODEL=tinyllama STREAMING=1 ```` + +## Manual / Development + +Make sure ollama is running, then open 2 terminal windows. + +In the first, launch llama stack configured to talk to ollama: + +```bash +OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starte +``` + +In the second, launch the cc chatbot: + +```bash +cd cc-chatbot/chatbot +OLLAMA_URL=http://localhost:8321 uvicorn api:app --reload +``` + +## Building the vector store + +Make sure ollama is running, then open 2 terminal windows. + +In the first, launch llama stack configured to talk to ollama: + +```bash +OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starte +``` + +In the second, run cc-vec: + +```bash +uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 1000 --vector-store-name 'commoncrawl-org-v1' --chunk-size 800 --overlap 400 +``` diff --git a/cc-chatbot/docker/Dockerfile.cc-vec-bot b/cc-chatbot/docker/Dockerfile.cc-vec-bot index 0f0f81a..58d365f 100644 --- a/cc-chatbot/docker/Dockerfile.cc-vec-bot +++ b/cc-chatbot/docker/Dockerfile.cc-vec-bot @@ -20,7 +20,7 @@ ENV PATH="/usr/local/bin:${PATH}" # # Build examples: # docker build --build-arg PREFETCH_MODEL=1 -t cc-vec-bot . 
# bake tinyllama -# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . +# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . # bake llama3.2:3b # docker build -t cc-vec-bot . # no prefetch (default) # --------------------------------------------------------------------------- ARG PREFETCH_MODEL=0 diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml deleted file mode 100644 index d93e2b1..0000000 --- a/docker/docker-compose.yaml +++ /dev/null @@ -1,23 +0,0 @@ -services: - cc-vec-bot: - build: - context: .. - dockerfile: docker/Dockerfile.cc-vec-bot - args: - PREFETCH_MODEL: ${PREFETCH_MODEL:-1} - INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} - image: cc-vec-bot - environment: - LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001} - CHATBOT_PORT: ${CHATBOT_PORT:-8008} - OLLAMA_URL: ${OLLAMA_URL:-} - OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1} - INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama} - ports: - - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}" - - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}" - - "11434:11434" - volumes: - - ~/.llama:/root/.llama - -
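A quick way to smoke-test the chatbot added in this series, once the compose stack from cc-chatbot/docker is up, is a sketch like the following (assuming the defaults above: CHATBOT_PORT=8008 and OLLAMA_STREAMING=1):

```bash
# Hedged example: exercises the POST /api/chat endpoint defined in cc-chatbot/chatbot/api.py.
# With OLLAMA_STREAMING=1 (the default) the reply arrives as server-sent events; -N keeps
# curl from buffering the stream. Host and port depend on your .env.
curl -N -X POST http://localhost:8008/api/chat \
  -H 'Content-Type: application/json' \
  -d '{"message": "What is Common Crawl?"}'

# With OLLAMA_STREAMING=0 the same request returns a single JSON object instead:
# {"response": "..."}
```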