15 changes: 15 additions & 0 deletions README.md
@@ -140,6 +140,21 @@ export AWS_SECRET_ACCESS_KEY=your-secret
uv run cc-vec index --url-patterns "%.edu" --limit 10
```

To use cc-vec with a local Llama Stack server and Ollama embeddings:

```bash
# Set environment variables
export OPENAI_BASE_URL=http://localhost:8321/v1
export OPENAI_API_KEY=none # Llama Stack doesn't require a real key
export OPENAI_EMBEDDING_MODEL=ollama/nomic-embed-text:latest
export OPENAI_EMBEDDING_DIMENSIONS=768

# Set your Athena credentials
export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
export AWS_PROFILE=cc-volunteers
export AWS_DEFAULT_REGION=us-east-1

# Use cc-vec with local models
uv run cc-vec index --url-patterns "%.edu" --limit 10
```

**Documentation:**
- [Llama Stack Docs](https://llamastack.github.io/)
- [Llama Stack GitHub](https://github.com/meta-llama/llama-stack)
55 changes: 55 additions & 0 deletions cc-chatbot/chatbot/README.md
@@ -0,0 +1,55 @@
# Minimal CC Chatbot Frontend

## Quickstart

Launch the Ollama server, then run:

```bash
pip install -r requirements.txt
uvicorn api:app --reload
```

Click on the link that uvicorn prints in your terminal to open the frontend.
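
To sanity-check the backend without the browser, you can post a message straight to the chat endpoint. This is a minimal check, assuming uvicorn's default port of 8000 and the streaming default (the reply arrives as `data:` lines):

```bash
# Smoke test: send one message to the chatbot API (assumes uvicorn's default port 8000)
curl -N -X POST http://localhost:8000/api/chat \
  -H 'Content-Type: application/json' \
  -d '{"message": "Hello"}'
```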

## Configuration

Configuration is done via environment variables in `api.py`. Defaults:

```bash
OLLAMA_URL=http://localhost:11434 # /api/generate is appended
INFERENCE_MODEL=tinyllama
OLLAMA_STREAMING=1
```
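
For example, to try a different model with streaming turned off for a single run (both variables are read in `api.py`; the model name here is only an illustration):

```bash
# Example override: different model, streaming disabled (model name is illustrative)
INFERENCE_MODEL=llama3.2:3b OLLAMA_STREAMING=0 uvicorn api:app --reload
```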

## Manual / Development

Make sure Ollama is running, then open two terminal windows.

In the first, launch llama stack configured to talk to ollama:

```bash
OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
```

In the second, launch the CC chatbot:

```bash
cd cc-chatbot/chatbot
OLLAMA_URL=http://localhost:8321 uvicorn api:app --reload
```

## Building the vector store

Make sure Ollama is running, then open two terminal windows.

In the first, launch llama stack configured to talk to ollama:

```bash
OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
```

In the second, run cc-vec:

```bash
uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 1000 --vector-store-name 'commoncrawl-org-v1' --chunk-size 800 --overlap 400
```
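
To confirm the store was created, one option is to list vector stores through the server's OpenAI-compatible API. This is a sketch, assuming Llama Stack is on its default port 8321 and exposes the standard `/v1/vector_stores` listing endpoint:

```bash
# List vector stores on the local Llama Stack server (assumes port 8321 and the
# OpenAI-compatible /v1/vector_stores endpoint); look for 'commoncrawl-org-v1'
curl -s http://localhost:8321/v1/vector_stores \
  -H "Authorization: Bearer none" | python3 -m json.tool
```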
49 changes: 49 additions & 0 deletions cc-chatbot/chatbot/api.py
@@ -0,0 +1,49 @@
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
import httpx
import os
import json

app = FastAPI()
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") + "/api/generate"
INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama")
STREAMING = os.environ.get("OLLAMA_STREAMING", "1") != "0"

@app.get("/")
async def serve_index():
    print("serving index.html")
    return FileResponse("index.html")

@app.post("/api/chat")
async def chat(request: Request):
    data = await request.json()
    prompt = data.get("message", "")
    payload = {"model": INFERENCE_MODEL, "prompt": prompt, "stream": STREAMING}

    if STREAMING:
        # Stream JSON lines to the frontend as SSE
        async def event_stream():
            async with httpx.AsyncClient(timeout=None) as client:
                async with client.stream("POST", OLLAMA_URL, json=payload) as r:
                    r.raise_for_status()
                    async for line in r.aiter_lines():
                        line = line.strip()
                        if not line:
                            continue
                        try:
                            obj = json.loads(line)
                            # Only send the 'response' field
                            response_text = obj.get('response', '')
                            if response_text:
                                yield f"data: {json.dumps({'response': response_text})}\n\n"
                        except Exception as e:
                            print(f"Error parsing line: {e}")
                            continue
        return StreamingResponse(event_stream(), media_type="text/event-stream")
    else:
        # Non-streaming mode
        async with httpx.AsyncClient(timeout=None) as client:
            r = await client.post(OLLAMA_URL, json=payload)
            r.raise_for_status()
            result = r.json()
            return JSONResponse({"response": result.get("response", "")})
75 changes: 75 additions & 0 deletions cc-chatbot/chatbot/index.html
@@ -0,0 +1,75 @@
<!DOCTYPE html>
<html>
<head>
    <title>Llama Chatbot</title>
    <style>
        body { font-family: sans-serif; max-width: 600px; margin: 2em auto; }
        #chat { width: 100%; height: 300px; }
        #input { width: 100%; }
    </style>
</head>
<body>
    <h2>Creative Commons Chatbot</h2>
    <p>powered by Llama Stack</p>
    <textarea id="chat" readonly></textarea><br>
    <input id="input" autocomplete="off" placeholder="Type your message..." />
    <button onclick="send()">Send</button>
    <script>
        async function send() {
            const input = document.getElementById('input');
            const chat = document.getElementById('chat');
            const msg = input.value;
            if (!msg) return;
            chat.value += "You: " + msg + "\n";
            input.value = "";

            const res = await fetch('/api/chat', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({message: msg})
            });

            // Check if response is streaming (SSE) or regular JSON
            const contentType = res.headers.get('content-type');
            if (contentType && contentType.includes('text/event-stream')) {
                // Handle streaming response
                chat.value += "Bot: ";
                const reader = res.body.getReader();
                const decoder = new TextDecoder();
                let buffer = '';

                while (true) {
                    const {done, value} = await reader.read();
                    if (done) break;

                    buffer += decoder.decode(value, {stream: true});
                    const lines = buffer.split('\n');
                    buffer = lines.pop(); // Keep incomplete line in buffer

                    for (const line of lines) {
                        if (line.startsWith('data: ')) {
                            try {
                                const data = JSON.parse(line.slice(6));
                                chat.value += data.response || '';
                                chat.scrollTop = chat.scrollHeight;
                            } catch (e) {
                                // Skip malformed JSON
                            }
                        }
                    }
                }
                chat.value += "\n";
            } else {
                // Handle non-streaming JSON response
                const data = await res.json();
                chat.value += "Bot: " + data.response + "\n";
            }

            chat.scrollTop = chat.scrollHeight;
        }
        document.getElementById('input').addEventListener('keydown', e => {
            if (e.key === 'Enter') send();
        });
    </script>
</body>
</html>
3 changes: 3 additions & 0 deletions cc-chatbot/chatbot/requirements.txt
@@ -0,0 +1,3 @@
fastapi
httpx
uvicorn
31 changes: 31 additions & 0 deletions cc-chatbot/docker/.env.sample
@@ -0,0 +1,31 @@
# Docker Compose Configuration for cc-vec-bot
# Copy this file to .env and customize values as needed

# Run: docker compose up --build

# Inference model to use (tinyllama, llama2, llama3.2:3b, etc.)
# Recommended: tinyllama for local testing (~700MB), llama3.2:3b for production
INFERENCE_MODEL=tinyllama

# LLM model files are large.
# Set to 1 to pre-fetch the model at build time (increases image size and build time)
# Set to 0 to fetch when the image is run (smaller, faster build but redundant runtime fetch of large models)
PREFETCH_MODEL=1

# Ports
LLAMA_STACK_PORT=5001
CHATBOT_PORT=8008

# Streaming mode for chatbot responses (0=off, 1=on)
OLLAMA_STREAMING=1

# Ollama URL (optional - defaults to local Ollama on port 11434)
# Leave empty to use built-in Ollama, or set to external instance
# Examples:
# OLLAMA_URL=http://host.docker.internal:11434
# OLLAMA_URL=http://192.168.1.100:11435
# OLLAMA_URL=

# ChromaDB URL (optional - for persistent vector storage)
# CHROMADB_URL=http://localhost:8000

1 change: 1 addition & 0 deletions cc-chatbot/docker/.gitignore
@@ -0,0 +1 @@
.env
103 changes: 103 additions & 0 deletions cc-chatbot/docker/Dockerfile.cc-vec-bot
@@ -0,0 +1,103 @@
FROM llamastack/distribution-starter:0.4.1
LABEL maintainer="damian@commoncrawl.org"

USER root

# Install minimal dependencies required by the Ollama install script
RUN apt-get update \
&& apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd python3-pip \
&& rm -rf /var/lib/apt/lists/*

# Install Ollama (for local inference when OLLAMA_URL is not set)
RUN curl -fsSL https://ollama.ai/install.sh | sh

ENV PATH="/usr/local/bin:${PATH}"

# ---------------------------------------------------------------------------
# Build-time model pre-fetch (optional)
# Set PREFETCH_MODEL=1 to bake the model into the image at build time.
# This makes the image larger but faster to start.
#
# Build examples:
# docker build --build-arg PREFETCH_MODEL=1 -t cc-vec-bot . # bake tinyllama
# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . # bake llama3.2:3b
# docker build -t cc-vec-bot . # no prefetch (default)
# ---------------------------------------------------------------------------
ARG PREFETCH_MODEL=0
ARG INFERENCE_MODEL=tinyllama

# Pre-fetch model at build time if PREFETCH_MODEL=1
# Requires starting ollama serve temporarily during build
RUN if [ "$PREFETCH_MODEL" = "1" ]; then \
        echo "Pre-fetching model: ${INFERENCE_MODEL}"; \
        ollama serve & \
        OLLAMA_PID=$!; \
        sleep 5; \
        for i in 1 2 3 4 5 6 7 8 9 10; do \
            curl -s http://localhost:11434/api/tags >/dev/null 2>&1 && break; \
            sleep 2; \
        done; \
        ollama pull "${INFERENCE_MODEL}"; \
        kill $OLLAMA_PID 2>/dev/null || true; \
        echo "Model ${INFERENCE_MODEL} pre-fetched successfully"; \
    else \
        echo "Skipping model pre-fetch (PREFETCH_MODEL=0)"; \
    fi

# ---------------------------------------------------------------------------
# Compatibility with deprecated llamastack/distribution-ollama
# Usage:
# export LLAMA_STACK_PORT=5001
# docker run -it \
# -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \
# -v ~/.llama:/root/.llama \
# cc-vec-bot \
# --port $LLAMA_STACK_PORT \
# --env INFERENCE_MODEL=tinyllama \
# --env OLLAMA_URL=http://host.docker.internal:11434
#
# Or with built-in Ollama (no external Ollama needed):
# docker run -it \
# -p 5001:5001 -p 11434:11434 \
# -v ~/.llama:/root/.llama \
# cc-vec-bot \
# --port 5001 \
# --env INFERENCE_MODEL=tinyllama
# ---------------------------------------------------------------------------

# Default model (inherits from build ARG, can be overridden at runtime)
# tinyllama ~637MB (smallest practical LLM)
# all-minilm ~45MB (embeddings only)
# llama3.2:3b ~2GB (production)
ENV INFERENCE_MODEL=${INFERENCE_MODEL}

# Default ports
ENV LLAMA_STACK_PORT=5001
ENV CHATBOT_PORT=8008

# Streaming mode (0=off, 1=on)
ENV OLLAMA_STREAMING=1

# ---------------------------------------------------------------------------
# Install chatbot-frontend
# ---------------------------------------------------------------------------

WORKDIR /opt/chatbot-frontend
COPY chatbot .
RUN pip3 install --no-cache-dir -r requirements.txt

# ---------------------------------------------------------------------------
# Copy and setup entrypoint script
# ---------------------------------------------------------------------------

COPY docker/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Volume for llama-stack config and model cache
VOLUME ["/root/.llama"]

# Expose both llama-stack and ollama ports
EXPOSE 5001 11434

ENTRYPOINT ["/entrypoint.sh"]
CMD []
48 changes: 48 additions & 0 deletions cc-chatbot/docker/README.md
@@ -0,0 +1,48 @@
# CC Chatbot Docker Setup

## Quickstart

To run with the default configuration (built-in Ollama, tinyllama baked into the image, chatbot on http://localhost:8008), run:

`docker-compose up --build`
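
Once the containers are up, a quick way to confirm the frontend is being served (assuming the default `CHATBOT_PORT` of 8008):

```bash
# Fetch the chatbot frontend; should return the contents of index.html
curl -s http://localhost:8008/
```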

## Configuration

Copy `.env.sample` to `.env` and modify as needed to customize configuration.

Alternatively, set environment variables directly in your shell before running the `docker-compose up` command. For example:
```bash
OLLAMA_URL=http://host.docker.internal:11434 PREFETCH_MODEL=0 INFERENCE_MODEL=llama3.2:3B docker-compose up --build
```


## Populating a vector store

Spin up a llama-stack instance where you want the vector store to live:

```bash
uv run --with llama-stack==0.4.1 llama stack run starter
```

Wait until you see the `Uvicorn running on <url>` message. Then test that everything works:

```bash
# Set environment variables
export OPENAI_BASE_URL=http://localhost:8321/v1
export OPENAI_API_KEY=none # Llama Stack doesn't require a real key
export OPENAI_EMBEDDING_MODEL=sentence-transformers/nomic-ai/nomic-embed-text-v1.5
export OPENAI_EMBEDDING_DIMENSIONS=768

# Set your Athena credentials
export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
export AWS_PROFILE=cc-volunteers
export AWS_DEFAULT_REGION=us-east-1

# Use cc-vec with local models
uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 10
```

If it succeeds, run it again with `--limit 1000` to index everything.


> Note: when running Llama Stack locally for debugging, it needs the additional pip packages `sentence-transformers` and `einops`.
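
One way to pull those in for a local debug run is with extra `--with` flags on `uv run` (a sketch; adapt to however you manage the environment):

```bash
# Add the extra packages needed for local debugging alongside llama-stack
uv run --with llama-stack==0.4.1 --with sentence-transformers --with einops \
  llama stack run starter
```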