diff --git a/README.md b/README.md
index ad49916..92e6956 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,21 @@ export AWS_SECRET_ACCESS_KEY=your-secret
uv run cc-vec index --url-patterns "%.edu" --limit 10
```
+
+Alternative configuration using a local Ollama embedding model served through Llama Stack:
+
+```bash
+# Set environment variables
+export OPENAI_BASE_URL=http://localhost:8321/v1
+export OPENAI_API_KEY=none  # Llama Stack doesn't require a real key
+export OPENAI_EMBEDDING_MODEL=ollama/nomic-embed-text:latest
+export OPENAI_EMBEDDING_DIMENSIONS=768
+
+# Set your Athena credentials
+export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
+export AWS_PROFILE=cc-volunteers
+export AWS_DEFAULT_REGION=us-east-1
+
+# Use cc-vec with local models
+uv run cc-vec index --url-patterns "%.edu" --limit 10
+```
+
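+You can optionally check that the embedding endpoint answers before indexing. This assumes Llama Stack exposes the standard OpenAI `/v1/embeddings` route at the base URL configured above:
+
+```bash
+# Request a single embedding through the local Llama Stack endpoint
+curl -s http://localhost:8321/v1/embeddings \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "ollama/nomic-embed-text:latest", "input": "hello"}'
+```
+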
**Documentation:**
- [Llama Stack Docs](https://llamastack.github.io/)
- [Llama Stack GitHub](https://github.com/meta-llama/llama-stack)
diff --git a/cc-chatbot/chatbot/README.md b/cc-chatbot/chatbot/README.md
new file mode 100644
index 0000000..b0313ab
--- /dev/null
+++ b/cc-chatbot/chatbot/README.md
@@ -0,0 +1,55 @@
+# Minimal CC Chatbot Frontend
+
+## Quickstart
+
+Launch the Ollama server, then run:
+
+```bash
+pip install -r requirements.txt
+uvicorn api:app --reload
+```
+
+Click on the link that uvicorn prints in your terminal to open the frontend.
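+
+You can also exercise the backend from the terminal. This assumes uvicorn's default address of `http://127.0.0.1:8000`; replies arrive as Server-Sent Events:
+
+```bash
+# -N disables curl's buffering so streamed "data:" lines appear as they arrive
+curl -N -X POST http://127.0.0.1:8000/api/chat \
+  -H 'Content-Type: application/json' \
+  -d '{"message": "What is Common Crawl?"}'
+```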
+
+## Configuration
+
+Configuration is read from environment variables in `api.py`. Defaults:
+
+```bash
+OLLAMA_URL=http://localhost:11434   # /api/generate is appended
+INFERENCE_MODEL=tinyllama
+OLLAMA_STREAMING=1
+```
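+
+For example, to point the frontend at a different Ollama host (the address below is just a placeholder) and disable streaming:
+
+```bash
+OLLAMA_URL=http://192.168.1.50:11434 INFERENCE_MODEL=llama3.2:3b OLLAMA_STREAMING=0 uvicorn api:app --reload
+```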
+
+## Manual / Development
+
+Make sure Ollama is running, then open two terminal windows.
+
+In the first, launch llama stack configured to talk to ollama:
+
+```bash
+OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
+```
+
+In the second, launch the cc chatbot:
+
+```bash
+cd cc-chatbot/chatbot
+OLLAMA_URL=http://localhost:8321 uvicorn api:app --reload
+```
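+
+Before launching the chatbot you can check that llama stack is listening; this assumes its default port 8321 and the OpenAI-compatible `/v1/models` listing it serves:
+
+```bash
+curl -s http://localhost:8321/v1/models
+```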
+
+## Building the vector store
+
+Make sure Ollama is running, then open two terminal windows.
+
+In the first, launch llama stack configured to talk to ollama:
+
+```bash
+OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
+```
+
+In the second, run cc-vec:
+
+```bash
+uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 1000 --vector-store-name 'commoncrawl-org-v1' --chunk-size 800 --overlap 400
+```
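+
+Once indexing finishes, you can list vector stores through the same OpenAI-compatible base URL that cc-vec uses to confirm `commoncrawl-org-v1` exists (the exact route is an assumption about this Llama Stack version):
+
+```bash
+curl -s http://localhost:8321/v1/vector_stores
+```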
diff --git a/cc-chatbot/chatbot/api.py b/cc-chatbot/chatbot/api.py
new file mode 100644
index 0000000..0091cbc
--- /dev/null
+++ b/cc-chatbot/chatbot/api.py
@@ -0,0 +1,49 @@
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
+import httpx
+import os
+import json
+
+app = FastAPI()
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") + "/api/generate"
+INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama")
+STREAMING = os.environ.get("OLLAMA_STREAMING", "1") != "0"
+
+@app.get("/")
+async def serve_index():
+ print("serving index.html")
+ return FileResponse("index.html")
+
+@app.post("/api/chat")
+async def chat(request: Request):
+ data = await request.json()
+ prompt = data.get("message", "")
+ payload = {"model": INFERENCE_MODEL, "prompt": prompt, "stream": STREAMING}
+
+ if STREAMING:
+ # Stream JSON lines to the frontend as SSE
+ async def event_stream():
+ async with httpx.AsyncClient(timeout=None) as client:
+ async with client.stream("POST", OLLAMA_URL, json=payload) as r:
+ r.raise_for_status()
+ async for line in r.aiter_lines():
+ line = line.strip()
+ if not line:
+ continue
+ try:
+ obj = json.loads(line)
+ # Only send the 'response' field
+ response_text = obj.get('response', '')
+ if response_text:
+ yield f"data: {json.dumps({'response': response_text})}\n\n"
+ except Exception as e:
+ print(f"Error parsing line: {e}")
+ continue
+ return StreamingResponse(event_stream(), media_type="text/event-stream")
+ else:
+ # Non-streaming mode
+ async with httpx.AsyncClient(timeout=None) as client:
+ r = await client.post(OLLAMA_URL, json=payload)
+ r.raise_for_status()
+ result = r.json()
+ return JSONResponse({"response": result.get("response", "")})
diff --git a/cc-chatbot/chatbot/index.html b/cc-chatbot/chatbot/index.html
new file mode 100644
index 0000000..a3076df
--- /dev/null
+++ b/cc-chatbot/chatbot/index.html
@@ -0,0 +1,75 @@
+<!-- index.html: the page markup was stripped from this view of the file.
+     Recoverable content: page title "Llama Chatbot", heading "Creative Commons
+     Chatbot", tagline "powered by Llama Stack". The page posts messages to
+     /api/chat and renders the streamed response. -->
diff --git a/cc-chatbot/chatbot/requirements.txt b/cc-chatbot/chatbot/requirements.txt
new file mode 100644
index 0000000..c937678
--- /dev/null
+++ b/cc-chatbot/chatbot/requirements.txt
@@ -0,0 +1,3 @@
+fastapi
+httpx
+uvicorn
diff --git a/cc-chatbot/docker/.env.sample b/cc-chatbot/docker/.env.sample
new file mode 100644
index 0000000..76d48a2
--- /dev/null
+++ b/cc-chatbot/docker/.env.sample
@@ -0,0 +1,31 @@
+# Docker Compose Configuration for cc-vec-bot
+# Copy this file to .env and customize values as needed
+
+# Run: docker compose up --build
+
+# Inference model to use (tinyllama, llama2, llama3.2:3b, etc.)
+# Recommended: tinyllama for local testing (~637MB), llama3.2:3b for production
+INFERENCE_MODEL=tinyllama
+
+# LLM model files are large.
+# Set to 1 to pre-fetch the model at build time (increases image size and build time)
+# Set to 0 to fetch at container start (smaller, faster build, but large models are downloaded when the container first runs)
+PREFETCH_MODEL=1
+
+# Ports
+LLAMA_STACK_PORT=5001
+CHATBOT_PORT=8008
+
+# Streaming mode for chatbot responses (0=off, 1=on)
+OLLAMA_STREAMING=1
+
+# Ollama URL (optional - defaults to local Ollama on port 11434)
+# Leave empty to use built-in Ollama, or set to external instance
+# Examples:
+# OLLAMA_URL=http://host.docker.internal:11434
+# OLLAMA_URL=http://192.168.1.100:11435
+# OLLAMA_URL=
+
+# ChromaDB URL (optional - for persistent vector storage)
+# CHROMADB_URL=http://localhost:8000
+
diff --git a/cc-chatbot/docker/.gitignore b/cc-chatbot/docker/.gitignore
new file mode 100644
index 0000000..4c49bd7
--- /dev/null
+++ b/cc-chatbot/docker/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/cc-chatbot/docker/Dockerfile.cc-vec-bot b/cc-chatbot/docker/Dockerfile.cc-vec-bot
new file mode 100644
index 0000000..58d365f
--- /dev/null
+++ b/cc-chatbot/docker/Dockerfile.cc-vec-bot
@@ -0,0 +1,103 @@
+FROM llamastack/distribution-starter:0.4.1
+LABEL maintainer="damian@commoncrawl.org"
+
+USER root
+
+# Install dependencies for the Ollama install script, plus python3-pip for the chatbot frontend
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd python3-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Ollama (for local inference when OLLAMA_URL is not set)
+RUN curl -fsSL https://ollama.ai/install.sh | sh
+
+ENV PATH="/usr/local/bin:${PATH}"
+
+# ---------------------------------------------------------------------------
+# Build-time model pre-fetch (optional)
+# Set PREFETCH_MODEL=1 to bake the model into the image at build time.
+# This makes the image larger but faster to start.
+#
+# Build examples:
+# docker build --build-arg PREFETCH_MODEL=1 -t cc-vec-bot . # bake tinyllama
+# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . # bake llama3.2:3b
+# docker build -t cc-vec-bot . # no prefetch (default)
+# ---------------------------------------------------------------------------
+ARG PREFETCH_MODEL=0
+ARG INFERENCE_MODEL=tinyllama
+
+# Pre-fetch model at build time if PREFETCH_MODEL=1
+# Requires starting ollama serve temporarily during build
+RUN if [ "$PREFETCH_MODEL" = "1" ]; then \
+ echo "Pre-fetching model: ${INFERENCE_MODEL}"; \
+ ollama serve & \
+ OLLAMA_PID=$!; \
+ sleep 5; \
+ for i in 1 2 3 4 5 6 7 8 9 10; do \
+ curl -s http://localhost:11434/api/tags >/dev/null 2>&1 && break; \
+ sleep 2; \
+ done; \
+ ollama pull "${INFERENCE_MODEL}"; \
+ kill $OLLAMA_PID 2>/dev/null || true; \
+ echo "Model ${INFERENCE_MODEL} pre-fetched successfully"; \
+ else \
+ echo "Skipping model pre-fetch (PREFETCH_MODEL=0)"; \
+ fi
+
+# ---------------------------------------------------------------------------
+# Compatibility with deprecated llamastack/distribution-ollama
+# Usage:
+# export LLAMA_STACK_PORT=5001
+# docker run -it \
+# -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
+# -v ~/.llama:/root/.llama \
+# cc-vec-bot \
+# --port $LLAMA_STACK_PORT \
+# --env INFERENCE_MODEL=tinyllama \
+# --env OLLAMA_URL=http://host.docker.internal:11434
+#
+# Or with built-in Ollama (no external Ollama needed):
+# docker run -it \
+# -p 5001:5001 -p 11434:11434 \
+# -v ~/.llama:/root/.llama \
+# cc-vec-bot \
+# --port 5001 \
+# --env INFERENCE_MODEL=tinyllama
+# ---------------------------------------------------------------------------
+
+# Default model (inherits from build ARG, can be overridden at runtime)
+# tinyllama ~637MB (smallest practical LLM)
+# all-minilm ~45MB (embeddings only)
+# llama3.2:3b ~2GB (production)
+ENV INFERENCE_MODEL=${INFERENCE_MODEL}
+
+# Default ports
+ENV LLAMA_STACK_PORT=5001
+ENV CHATBOT_PORT=8008
+
+# Streaming mode (0=off, 1=on)
+ENV OLLAMA_STREAMING=1
+
+# ---------------------------------------------------------------------------
+# Install chatbot-frontend
+# ---------------------------------------------------------------------------
+
+WORKDIR /opt/chatbot-frontend
+COPY chatbot .
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# ---------------------------------------------------------------------------
+# Copy and setup entrypoint script
+# ---------------------------------------------------------------------------
+
+COPY docker/entrypoint.sh /entrypoint.sh
+RUN chmod +x /entrypoint.sh
+
+# Volume for llama-stack config and model cache
+VOLUME ["/root/.llama"]
+
+# Expose both llama-stack and ollama ports
+EXPOSE 5001 11434
+
+ENTRYPOINT ["/entrypoint.sh"]
+CMD []
diff --git a/cc-chatbot/docker/README.md b/cc-chatbot/docker/README.md
new file mode 100644
index 0000000..90c3cb0
--- /dev/null
+++ b/cc-chatbot/docker/README.md
@@ -0,0 +1,48 @@
+# CC chatbot docker setup
+
+## Quickstart
+
+To run with the default configuration (built-in Ollama, tinyllama baked into the image, chatbot at http://localhost:8008), run:
+
+`docker compose up --build`
+
+## Configuration
+
+Copy `.env.sample` to `.env` and modify as needed to customize configuration.
+
+Alternatively, set environment variables directly in your shell before running the `docker compose up` command. For example:
+```bash
+OLLAMA_URL=http://host.docker.internal:11434 PREFETCH_MODEL=0 INFERENCE_MODEL=llama3.2:3b docker compose up --build
+```
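+
+Once the containers are up, a quick smoke test from the host (ports are the defaults from `.env.sample`; the `/v1/models` route assumes llama-stack's OpenAI-compatible API):
+
+```bash
+# Chatbot frontend should return the index page
+curl -s http://localhost:8008/ | head -c 200
+
+# llama-stack should answer on its own port
+curl -s http://localhost:5001/v1/models
+```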
+
+
+## Populating a vector store
+
+Spin up a llama-stack instance where you want the vector store to live:
+
+```bash
+uv run --with llama-stack==0.4.1 llama stack run starter
+```
+
+Wait until you see the `Uvicorn running on ...` message, then check that everything works:
+
+```bash
+# Set environment variables
+export OPENAI_BASE_URL=http://localhost:8321/v1
+export OPENAI_API_KEY=none # Llama Stack doesn't require a real key
+export OPENAI_EMBEDDING_MODEL=sentence-transformers/nomic-ai/nomic-embed-text-v1.5
+export OPENAI_EMBEDDING_DIMENSIONS=768
+
+# Set your Athena credentials
+export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
+export AWS_PROFILE=cc-volunteers
+export AWS_DEFAULT_REGION=us-east-1
+
+# Use cc-vec with local models
+uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 10
+```
+
+If it succeeds, run it again with `--limit 1000` to index everything.
+
+
+> Note: when running Llama Stack locally for debugging, it needs the additional pip packages `sentence-transformers` and `einops`.
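+
+For example, the extra packages can be supplied with additional `--with` flags when launching the stack locally:
+
+```bash
+uv run --with llama-stack==0.4.1 --with sentence-transformers --with einops llama stack run starter
+```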
diff --git a/cc-chatbot/docker/docker-compose.yaml b/cc-chatbot/docker/docker-compose.yaml
new file mode 100644
index 0000000..1a121eb
--- /dev/null
+++ b/cc-chatbot/docker/docker-compose.yaml
@@ -0,0 +1,21 @@
+services:
+ cc-vec-bot:
+ build:
+ context: ..
+ dockerfile: docker/Dockerfile.cc-vec-bot
+ args:
+ PREFETCH_MODEL: ${PREFETCH_MODEL:-1}
+ INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}
+ image: cc-vec-bot
+ environment:
+ LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}
+ CHATBOT_PORT: ${CHATBOT_PORT:-8008}
+ OLLAMA_URL: ${OLLAMA_URL:-}
+ OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1}
+ INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}
+ ports:
+ - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+ - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}"
+ volumes:
+ - ~/.llama:/root/.llama
+
diff --git a/cc-chatbot/docker/entrypoint.sh b/cc-chatbot/docker/entrypoint.sh
new file mode 100644
index 0000000..64d9eb9
--- /dev/null
+++ b/cc-chatbot/docker/entrypoint.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+set -e
+
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+ case $1 in
+ --port)
+ LLAMA_STACK_PORT="$2"
+ shift 2
+ ;;
+ *)
+ # Unknown option, pass through
+ EXTRA_ARGS+=("$1")
+ shift
+ ;;
+ esac
+done
+
+echo "=============================================="
+echo "cc-vec-bot (llama-stack + ollama)"
+echo "=============================================="
+echo "LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}"
+echo "INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}"
+echo "OLLAMA_URL: ${OLLAMA_URL:-}"
+echo "CHATBOT_PORT: ${CHATBOT_PORT:-8008}"
+echo "=============================================="
+
+# Determine Ollama URL
+if [ -z "$OLLAMA_URL" ]; then
+ # No external Ollama URL provided - start local Ollama
+ echo "Starting local Ollama server..."
+ ollama serve &
+ OLLAMA_PID=$!
+ OLLAMA_URL="http://localhost:11434"
+ export OLLAMA_URL
+
+ # Wait for Ollama to be ready
+ echo "Waiting for Ollama to start..."
+ for i in {1..30}; do
+ if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
+ echo "Ollama is ready"
+ break
+ fi
+ if [ $i -eq 30 ]; then
+ echo "ERROR: Ollama failed to start"
+ exit 1
+ fi
+ sleep 1
+ done
+
+ # Pull model if specified and PREFETCH_MODEL wasn't set at build time
+ if [ -n "$INFERENCE_MODEL" ]; then
+ echo "Checking if model needs to be pulled: $INFERENCE_MODEL"
+ if ! ollama list | grep -q "^${INFERENCE_MODEL}"; then
+ echo "Pulling model: $INFERENCE_MODEL"
+ ollama pull "$INFERENCE_MODEL"
+ else
+ echo "Model $INFERENCE_MODEL already available"
+ fi
+ fi
+else
+ # External Ollama URL provided - verify connectivity
+ echo "Using external Ollama at: $OLLAMA_URL"
+ for i in {1..10}; do
+ if curl -s "${OLLAMA_URL}/api/tags" >/dev/null 2>&1; then
+ echo "External Ollama is reachable"
+ break
+ fi
+ if [ $i -eq 10 ]; then
+ echo "WARNING: Cannot reach external Ollama at $OLLAMA_URL"
+ fi
+ sleep 1
+ done
+fi
+
+# Start the chatbot in the background
+echo "Starting chatbot-frontend on port ${CHATBOT_PORT}..."
+cd /opt/chatbot-frontend
+uvicorn api:app --host 0.0.0.0 --port "${CHATBOT_PORT}" > /var/log/chatbot.log 2>&1 &
+CHATBOT_PID=$!
+
+# Give the chatbot a moment to start, then make sure it is still running
+sleep 2
+if ! kill -0 "${CHATBOT_PID}" 2>/dev/null; then
+    cat /var/log/chatbot.log
+    echo "ERROR: Failed to start chatbot-frontend"
+    exit 1
+fi
+echo "Chatbot-frontend started with PID ${CHATBOT_PID}"
+
+# Start llama-stack server
+# The distribution-starter base image includes llama-stack
+echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..."
+export OLLAMA_URL="${OLLAMA_URL}"
+export INFERENCE_MODEL="${INFERENCE_MODEL}"
+exec llama stack run starter \
+ --port "${LLAMA_STACK_PORT}" \
+ "${EXTRA_ARGS[@]}"
+