diff --git a/README.md b/README.md
index ad49916..92e6956 100644
--- a/README.md
+++ b/README.md
@@ -140,6 +140,24 @@ export AWS_SECRET_ACCESS_KEY=your-secret
 uv run cc-vec index --url-patterns "%.edu" --limit 10
 ```
 
+Example configuration for a local Llama Stack endpoint with Ollama embeddings:
+
+```bash
+# Set environment variables
+export OPENAI_BASE_URL=http://localhost:8321/v1
+export OPENAI_API_KEY=none # Llama Stack doesn't require a real key
+export OPENAI_EMBEDDING_MODEL=ollama/nomic-embed-text:latest
+export OPENAI_EMBEDDING_DIMENSIONS=768
+
+# Set your Athena credentials
+export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
+export AWS_PROFILE=cc-volunteers
+export AWS_DEFAULT_REGION=us-east-1
+
+# Use cc-vec with local models
+uv run cc-vec index --url-patterns "%.edu" --limit 10
+```
+
 **Documentation:**
 - [Llama Stack Docs](https://llamastack.github.io/)
 - [Llama Stack GitHub](https://github.com/meta-llama/llama-stack)
diff --git a/cc-chatbot/chatbot/README.md b/cc-chatbot/chatbot/README.md
new file mode 100644
index 0000000..b0313ab
--- /dev/null
+++ b/cc-chatbot/chatbot/README.md
@@ -0,0 +1,55 @@
+# Minimal CC Chatbot Frontend
+
+## Quickstart
+
+Launch the ollama server, then run:
+
+```bash
+pip install -r requirements.txt
+uvicorn api:app --reload
+```
+
+Click the link that uvicorn prints in your terminal to open the frontend.
+
+## Configuration
+
+Configuration is done via environment variables read by `api.py`. Defaults:
+
+```bash
+OLLAMA_URL=http://localhost:11434 # /api/generate is appended
+INFERENCE_MODEL=tinyllama
+OLLAMA_STREAMING=1
+```
+
+## Manual / Development
+
+Make sure ollama is running, then open 2 terminal windows.
+
+In the first, launch llama stack configured to talk to ollama:
+
+```bash
+OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
+```
+
+In the second, launch the cc chatbot:
+
+```bash
+cd cc-chatbot/chatbot
+OLLAMA_URL=http://localhost:8321 uvicorn api:app --reload
+```
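+
+To sanity-check the chatbot API once both processes are up, you can POST directly to the `/api/chat` endpoint (this assumes uvicorn's default port 8000; adjust if you pass `--port`):
+
+```bash
+# With streaming enabled this prints SSE lines such as: data: {"response": "..."}
+curl -N -X POST http://localhost:8000/api/chat \
+  -H 'Content-Type: application/json' \
+  -d '{"message": "Hello"}'
+```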
+
+## Building the vector store
+
+Make sure ollama is running, then open 2 terminal windows.
+
+In the first, launch llama stack configured to talk to ollama:
+
+```bash
+OLLAMA_URL=http://localhost:11434/v1 uv run --with llama-stack==0.4.1 llama stack run starter
+```
+
+In the second, run cc-vec:
+
+```bash
+uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 1000 --vector-store-name 'commoncrawl-org-v1' --chunk-size 800 --overlap 400
+```
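+
+Once indexing completes, you can confirm the store exists by listing vector stores through the same OpenAI-compatible API that cc-vec targets (this assumes llama stack's default port 8321; the exact route may vary between llama-stack versions):
+
+```bash
+curl -s http://localhost:8321/v1/vector_stores | python3 -m json.tool
+```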

diff --git a/cc-chatbot/chatbot/api.py b/cc-chatbot/chatbot/api.py
new file mode 100644
index 0000000..0091cbc
--- /dev/null
+++ b/cc-chatbot/chatbot/api.py
@@ -0,0 +1,49 @@
+from fastapi import FastAPI, Request
+from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
+import httpx
+import os
+import json
+
+app = FastAPI()
+OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434") + "/api/generate"
+INFERENCE_MODEL = os.environ.get("INFERENCE_MODEL", "tinyllama")
+STREAMING = os.environ.get("OLLAMA_STREAMING", "1") != "0"
+
+@app.get("/")
+async def serve_index():
+    print("serving index.html")
+    return FileResponse("index.html")
+
+@app.post("/api/chat")
+async def chat(request: Request):
+    data = await request.json()
+    prompt = data.get("message", "")
+    payload = {"model": INFERENCE_MODEL, "prompt": prompt, "stream": STREAMING}
+
+    if STREAMING:
+        # Stream JSON lines to the frontend as SSE
+        async def event_stream():
+            async with httpx.AsyncClient(timeout=None) as client:
+                async with client.stream("POST", OLLAMA_URL, json=payload) as r:
+                    r.raise_for_status()
+                    async for line in r.aiter_lines():
+                        line = line.strip()
+                        if not line:
+                            continue
+                        try:
+                            obj = json.loads(line)
+                            # Only send the 'response' field
+                            response_text = obj.get('response', '')
+                            if response_text:
+                                yield f"data: {json.dumps({'response': response_text})}\n\n"
+                        except Exception as e:
+                            print(f"Error parsing line: {e}")
+                            continue
+        return StreamingResponse(event_stream(), media_type="text/event-stream")
+    else:
+        # Non-streaming mode
+        async with httpx.AsyncClient(timeout=None) as client:
+            r = await client.post(OLLAMA_URL, json=payload)
+            r.raise_for_status()
+            result = r.json()
+            return JSONResponse({"response": result.get("response", "")})
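+
+# Note on the upstream format (an assumption based on Ollama's /api/generate API):
+# each streamed line is a JSON object such as
+#   {"model": "tinyllama", "response": "Hel", "done": false}
+# and only the "response" field is forwarded to the browser as an SSE "data:" event.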

diff --git a/cc-chatbot/chatbot/index.html b/cc-chatbot/chatbot/index.html
new file mode 100644
index 0000000..a3076df
--- /dev/null
+++ b/cc-chatbot/chatbot/index.html
@@ -0,0 +1,75 @@
+[75-line single-page frontend; the markup is not recoverable from this copy of the diff. Recoverable text: page title "Llama Chatbot", heading "Creative Commons Chatbot", tagline "powered by Llama Stack".]
diff --git a/cc-chatbot/chatbot/requirements.txt b/cc-chatbot/chatbot/requirements.txt
new file mode 100644
index 0000000..c937678
--- /dev/null
+++ b/cc-chatbot/chatbot/requirements.txt
@@ -0,0 +1,3 @@
+fastapi
+httpx
+uvicorn
diff --git a/cc-chatbot/docker/.env.sample b/cc-chatbot/docker/.env.sample
new file mode 100644
index 0000000..76d48a2
--- /dev/null
+++ b/cc-chatbot/docker/.env.sample
@@ -0,0 +1,31 @@
+# Docker Compose Configuration for cc-vec-bot
+# Copy this file to .env and customize values as needed
+
+# Run: docker compose up --build
+
+# Inference model to use (tinyllama, llama2, llama3.2:3b, etc.)
+# recommended: tinyllama for local testing (700MB), llama3.2:3B for production
+INFERENCE_MODEL=tinyllama
+
+# LLM model files are large.
+# Set to 1 to pre-fetch the model at build time (increases image size and build time)
+# Set to 0 to fetch when the image is run (smaller, faster build but redundant runtime fetch of large models)
+PREFETCH_MODEL=1
+
+# Ports
+LLAMA_STACK_PORT=5001
+CHATBOT_PORT=8008
+
+# Streaming mode for chatbot responses (0=off, 1=on)
+OLLAMA_STREAMING=1
+
+# Ollama URL (optional - defaults to local Ollama on port 11434)
+# Leave empty to use built-in Ollama, or set to external instance
+# Examples:
+# OLLAMA_URL=http://host.docker.internal:11434
+# OLLAMA_URL=http://192.168.1.100:11435
+# OLLAMA_URL=
+
+# ChromaDB URL (optional - for persistent vector storage)
+# CHROMADB_URL=http://localhost:8000
+
diff --git a/cc-chatbot/docker/.gitignore b/cc-chatbot/docker/.gitignore
new file mode 100644
index 0000000..4c49bd7
--- /dev/null
+++ b/cc-chatbot/docker/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/cc-chatbot/docker/Dockerfile.cc-vec-bot b/cc-chatbot/docker/Dockerfile.cc-vec-bot
new file mode 100644
index 0000000..58d365f
--- /dev/null
+++ b/cc-chatbot/docker/Dockerfile.cc-vec-bot
@@ -0,0 +1,103 @@
+FROM llamastack/distribution-starter:0.4.1
+LABEL maintainer="damian@commoncrawl.org"
+
+USER root
+
+# Install minimal dependencies required by the Ollama install script
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl ca-certificates gnupg zstd python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Ollama (for local inference when OLLAMA_URL is not set)
+RUN curl -fsSL https://ollama.ai/install.sh | sh
+
+ENV PATH="/usr/local/bin:${PATH}"
+
+# ---------------------------------------------------------------------------
+# Build-time model pre-fetch (optional)
+# Set PREFETCH_MODEL=1 to bake the model into the image at build time.
+# This makes the image larger but faster to start.
+#
+# Build examples:
+# docker build --build-arg PREFETCH_MODEL=1 -t cc-vec-bot . # bake tinyllama
+# docker build --build-arg PREFETCH_MODEL=1 --build-arg INFERENCE_MODEL=llama3.2:3b -t cc-vec-bot . # bake llama3.2:3b
+# docker build -t cc-vec-bot . 
# no prefetch (default) +# --------------------------------------------------------------------------- +ARG PREFETCH_MODEL=0 +ARG INFERENCE_MODEL=tinyllama + +# Pre-fetch model at build time if PREFETCH_MODEL=1 +# Requires starting ollama serve temporarily during build +RUN if [ "$PREFETCH_MODEL" = "1" ]; then \ + echo "Pre-fetching model: ${INFERENCE_MODEL}"; \ + ollama serve & \ + OLLAMA_PID=$!; \ + sleep 5; \ + for i in 1 2 3 4 5 6 7 8 9 10; do \ + curl -s http://localhost:11434/api/tags >/dev/null 2>&1 && break; \ + sleep 2; \ + done; \ + ollama pull "${INFERENCE_MODEL}"; \ + kill $OLLAMA_PID 2>/dev/null || true; \ + echo "Model ${INFERENCE_MODEL} pre-fetched successfully"; \ + else \ + echo "Skipping model pre-fetch (PREFETCH_MODEL=0)"; \ + fi + +# --------------------------------------------------------------------------- +# Compatibility with deprecated llamastack/distribution-ollama +# Usage: +# export LLAMA_STACK_PORT=5001 +# docker run -it \ +# -p $LLAMA_STACK_PORT:$LLAMA_STACK_PORT \ +# -v ~/.llama:/root/.llama \ +# cc-vec-bot \ +# --port $LLAMA_STACK_PORT \ +# --env INFERENCE_MODEL=tinyllama \ +# --env OLLAMA_URL=http://host.docker.internal:11434 +# +# Or with built-in Ollama (no external Ollama needed): +# docker run -it \ +# -p 5001:5001 -p 11434:11434 \ +# -v ~/.llama:/root/.llama \ +# cc-vec-bot \ +# --port 5001 \ +# --env INFERENCE_MODEL=tinyllama +# --------------------------------------------------------------------------- + +# Default model (inherits from build ARG, can be overridden at runtime) +# tinyllama ~637MB (smallest practical LLM) +# all-minilm ~45MB (embeddings only) +# llama3.2:3b ~2GB (production) +ENV INFERENCE_MODEL=${INFERENCE_MODEL} + +# Default ports +ENV LLAMA_STACK_PORT=5001 +ENV CHATBOT_PORT=8008 + +# Streaming mode (0=off, 1=on) +ENV OLLAMA_STREAMING=1 + +# --------------------------------------------------------------------------- +# Install chatbot-frontend +# --------------------------------------------------------------------------- + +WORKDIR /opt/chatbot-frontend +COPY chatbot . +RUN pip3 install --no-cache-dir -r requirements.txt + +# --------------------------------------------------------------------------- +# Copy and setup entrypoint script +# --------------------------------------------------------------------------- + +COPY docker/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Volume for llama-stack config and model cache +VOLUME ["/root/.llama"] + +# Expose both llama-stack and ollama ports +EXPOSE 5001 11434 + +ENTRYPOINT ["/entrypoint.sh"] +CMD [] diff --git a/cc-chatbot/docker/README.md b/cc-chatbot/docker/README.md new file mode 100644 index 0000000..90c3cb0 --- /dev/null +++ b/cc-chatbot/docker/README.md @@ -0,0 +1,48 @@ +# CC chatbot docker setup + +## Quickstart + +To run with default configuration (internal ollama, tinyllama baked into image, chatbot on http://localhost:8008), run: + +`docker-compose up --build` + +## Configuration + +Copy `.env.sample` to `.env` and modify as needed to customize configuration. + +Alternatively, set environment variables directly in your shell before running the `docker-compose up` command. For example: +```bash +OLLAMA_URL=http://host.docker.internal:11434 PREFETCH_MODEL=0 INFERENCE_MODEL=llama3.2:3B docker-compose up --build +``` + + +## Populating a vector store + +Spin up a llama-stack instance where you want the vector store to live: + +```bash +uv run --with llama-stack==0.4.1 llama stack run starter +``` + +Wait until you see the `Uvicorn running on ` message. 
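+
+You can also confirm the server is responding before indexing (this assumes llama stack's default port 8321 and its `/v1/models` listing endpoint):
+
+```bash
+curl -s http://localhost:8321/v1/models
+```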
+Then, test that everything works:
+
+```bash
+# Set environment variables
+export OPENAI_BASE_URL=http://localhost:8321/v1
+export OPENAI_API_KEY=none # Llama Stack doesn't require a real key
+export OPENAI_EMBEDDING_MODEL=sentence-transformers/nomic-ai/nomic-embed-text-v1.5
+export OPENAI_EMBEDDING_DIMENSIONS=768
+
+# Set your Athena credentials
+export ATHENA_OUTPUT_BUCKET=s3://cc-vec-damian-01/test-results
+export AWS_PROFILE=cc-volunteers
+export AWS_DEFAULT_REGION=us-east-1
+
+# Use cc-vec with local models
+uv run cc-vec index --url-patterns "%commoncrawl.org" --limit 10
+```
+
+If it succeeds, run it again with `--limit 1000` to index everything.
+
+> Note: when running llama stack locally for debugging, it needs the additional pip packages `sentence-transformers` and `einops`.
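+
+One way to provide those packages with `uv` is to add them to the same one-off environment (a sketch; adjust versions as needed):
+
+```bash
+uv run --with llama-stack==0.4.1 --with sentence-transformers --with einops llama stack run starter
+```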
diff --git a/cc-chatbot/docker/docker-compose.yaml b/cc-chatbot/docker/docker-compose.yaml
new file mode 100644
index 0000000..1a121eb
--- /dev/null
+++ b/cc-chatbot/docker/docker-compose.yaml
@@ -0,0 +1,21 @@
+services:
+  cc-vec-bot:
+    build:
+      context: ..
+      dockerfile: docker/Dockerfile.cc-vec-bot
+      args:
+        PREFETCH_MODEL: ${PREFETCH_MODEL:-1}
+        INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}
+    image: cc-vec-bot
+    environment:
+      LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}
+      CHATBOT_PORT: ${CHATBOT_PORT:-8008}
+      OLLAMA_URL: ${OLLAMA_URL:-}
+      OLLAMA_STREAMING: ${OLLAMA_STREAMING:-1}
+      INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}
+    ports:
+      - "${LLAMA_STACK_PORT:-5001}:${LLAMA_STACK_PORT:-5001}"
+      - "${CHATBOT_PORT:-8008}:${CHATBOT_PORT:-8008}"
+    volumes:
+      - ~/.llama:/root/.llama
+
diff --git a/cc-chatbot/docker/entrypoint.sh b/cc-chatbot/docker/entrypoint.sh
new file mode 100644
index 0000000..64d9eb9
--- /dev/null
+++ b/cc-chatbot/docker/entrypoint.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+set -e
+
+EXTRA_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --port)
+            LLAMA_STACK_PORT="$2"
+            shift 2
+            ;;
+        *)
+            # Unknown option, pass through
+            EXTRA_ARGS+=("$1")
+            shift
+            ;;
+    esac
+done
+
+echo "=============================================="
+echo "cc-vec-bot (llama-stack + ollama)"
+echo "=============================================="
+echo "LLAMA_STACK_PORT: ${LLAMA_STACK_PORT:-5001}"
+echo "INFERENCE_MODEL: ${INFERENCE_MODEL:-tinyllama}"
+echo "OLLAMA_URL: ${OLLAMA_URL:-}"
+echo "CHATBOT_PORT: ${CHATBOT_PORT:-8008}"
+echo "=============================================="
+
+# Determine Ollama URL
+if [ -z "$OLLAMA_URL" ]; then
+    # No external Ollama URL provided - start local Ollama
+    echo "Starting local Ollama server..."
+    ollama serve &
+    OLLAMA_PID=$!
+    OLLAMA_URL="http://localhost:11434"
+    export OLLAMA_URL
+
+    # Wait for Ollama to be ready
+    echo "Waiting for Ollama to start..."
+    for i in {1..30}; do
+        if curl -s http://localhost:11434/api/tags >/dev/null 2>&1; then
+            echo "Ollama is ready"
+            break
+        fi
+        if [ $i -eq 30 ]; then
+            echo "ERROR: Ollama failed to start"
+            exit 1
+        fi
+        sleep 1
+    done
+
+    # Pull model if specified and PREFETCH_MODEL wasn't set at build time
+    if [ -n "$INFERENCE_MODEL" ]; then
+        echo "Checking if model needs to be pulled: $INFERENCE_MODEL"
+        if ! ollama list | grep -q "^${INFERENCE_MODEL}"; then
+            echo "Pulling model: $INFERENCE_MODEL"
+            ollama pull "$INFERENCE_MODEL"
+        else
+            echo "Model $INFERENCE_MODEL already available"
+        fi
+    fi
+else
+    # External Ollama URL provided - verify connectivity
+    echo "Using external Ollama at: $OLLAMA_URL"
+    for i in {1..10}; do
+        if curl -s "${OLLAMA_URL}/api/tags" >/dev/null 2>&1; then
+            echo "External Ollama is reachable"
+            break
+        fi
+        if [ $i -eq 10 ]; then
+            echo "WARNING: Cannot reach external Ollama at $OLLAMA_URL"
+        fi
+        sleep 1
+    done
+fi
+
+# Start the chatbot in the background, logging its output for troubleshooting
+echo "Starting chatbot-frontend on port ${CHATBOT_PORT}..."
+(cd /opt/chatbot-frontend && exec uvicorn api:app --host 0.0.0.0 --port "${CHATBOT_PORT}" > /var/log/chatbot.log 2>&1) &
+CHATBOT_PID=$!
+echo "Chatbot-frontend started with PID ${CHATBOT_PID}"
+
+# Give the chatbot a moment to start, then verify it is still running
+sleep 2
+if ! kill -0 "${CHATBOT_PID}" 2>/dev/null; then
+    cat /var/log/chatbot.log
+    echo "ERROR: Failed to start chatbot-frontend"
+    exit 1
+fi
+
+# Start llama-stack server
+# The distribution-starter base image includes llama-stack
+echo "Starting llama-stack on port ${LLAMA_STACK_PORT}..."
+export OLLAMA_URL="${OLLAMA_URL}"
+export INFERENCE_MODEL="${INFERENCE_MODEL}"
+exec llama stack run starter \
+    --port "${LLAMA_STACK_PORT}" \
+    "${EXTRA_ARGS[@]}"