diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..3fe5b51c --- /dev/null +++ b/.dockerignore @@ -0,0 +1,69 @@ +# Git +.git +.gitignore +.gitattributes + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +.venv +*.egg-info/ +dist/ +build/ +*.egg + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Docker (don't copy docker files into the image) +Dockerfile +docker-compose*.yml +.dockerignore + +# Database lock files +*.db +*.sqlite +*.sqlite3 +.db_initialized + +# Logs +logs/ +*.log + +# Test files +.coverage +htmlcov/ +.pytest_cache/ +.tox/ + +# CI/CD +.github/ +.travis.yml +.circleci/ + +# Temporary files +tmp/ +temp/ +*.tmp + +# Secrets and credentials (prevent accidental inclusion) +*.pem +*.key +service-account.json +gcp-key.json +secret_key +secret_csrf \ No newline at end of file diff --git a/.gitignore b/.gitignore index 4f8ca2d8..4a42b168 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,11 @@ static/img/status/build-windows.svg # Gunicorn gunicorn.pid +gunicorn.ctl + +# Docker-generated files (leak via .:/app volume mount) +.db_initialized +migrations/versions/*docker_auto_migration* # OS Generated Files .DS_Store diff --git a/DOCKER.md b/DOCKER.md new file mode 100644 index 00000000..f33f4bc9 --- /dev/null +++ b/DOCKER.md @@ -0,0 +1,468 @@ +# 🐳 Sample Platform — Docker Setup Guide + +> One-command local development environment for the CCExtractor Sample Platform. 
+ +--- + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Quick Start](#quick-start) +- [Architecture](#architecture) +- [Configuration](#configuration) + - [Environment Variables Reference](#environment-variables-reference) + - [Google Cloud Storage](#google-cloud-storage) + - [GitHub Integration](#github-integration) +- [Usage](#usage) + - [Starting the Platform](#starting-the-platform) + - [Stopping the Platform](#stopping-the-platform) + - [Viewing Logs](#viewing-logs) + - [Live Code Reloading (Development)](#live-code-reloading-development) + - [Full Reset (Clean Slate)](#full-reset-clean-slate) +- [Database](#database) + - [Connecting Directly](#connecting-directly) + - [Migrations](#migrations) + - [Re-seeding](#re-seeding) +- [Design Decisions](#design-decisions) +- [Troubleshooting](#troubleshooting) +- [File Overview](#file-overview) + +--- + +## Prerequisites + +| Tool | Minimum Version | Check Command | +| ------------------ | --------------- | ------------------------ | +| **Docker Engine** | 20.10+ | `docker --version` | +| **Docker Compose** | 2.0+ (V2) | `docker compose version` | + +> **Windows / macOS**: Install [Docker Desktop](https://www.docker.com/products/docker-desktop/) — it bundles both. +> +> **Linux**: Install Docker Engine + the Compose plugin via [the official docs](https://docs.docker.com/engine/install/). + +--- + +## Quick Start + +```bash +# 1. Clone the repository (if you haven't already) +git clone https://github.com/CCExtractor/sample-platform.git +cd sample-platform + +# 2. Create your environment file from the template +cp env.example .env +# → Edit .env with your own values (see Configuration below) + +# 3. (Optional) Place your GCP service-account key +# If you don't have one, the app will still start but GCS features won't work. +# See "Google Cloud Storage" section below. + +# 4. Build and start everything +docker compose up -d --build + +# 5. 
Wait ~20 seconds for MySQL to initialize, then open: +# http://localhost:5000 +``` + +**Default admin credentials** (set in `.env`): + +| Field | Value | +| -------- | ------------------- | +| Email | `admin@example.com` | +| Password | `admin` | + +--- + +## Architecture + +``` +┌─────────────────────────────────────────────────────────┐ +│ Docker Network │ +│ sample_platform_network │ +│ │ +│ ┌──────────────┐ ┌─────────────────────────┐ │ +│ │ MySQL 8.0 │◄──────── │ Flask Backend (Py 3.11) │ │ +│ │ │ :3306 │ │ │ +│ │ db_data vol │ │ Gunicorn (4 workers) │ │ +│ └──────────────┘ │ :5000 │ │ +│ │ │ │ +│ │ .:/app (live mount) │ │ +│ │ repository_data vol │ │ +│ └─────────────────────────┘ │ +│ │ │ +└──────────────────────────────────────│──────────────────┘ + │ + Host :5000 + http://localhost:5000 +``` + +| Service | Container Name | Image | Exposed Port | +| ----------- | ------------------------- | ----------------------- | ------------ | +| **db** | `sample_platform_db` | `mysql:8.0` | `3306` | +| **backend** | `sample_platform_backend` | Built from `Dockerfile` | `5000` | + +--- + +## Configuration + +All configuration is driven by the **`.env`** file. Docker Compose reads it via `env_file:`. + +> **Single source of truth**: The `.env` file is the only place you set values. There is no +> duplicate `environment:` block in `docker-compose.yml` — this avoids the common problem of +> environment overriding env_file silently. 
+ +### Environment Variables Reference + +#### MySQL + +| Variable | Description | Default | +| ---------------------- | ------------------------------------------------ | ------------------ | +| `MYSQL_ROOT_PASSWORD` | MySQL root password | `root` | +| `MYSQL_USER` | Application-level DB user (auto-created by MySQL)| `sample_platform` | +| `MYSQL_PASSWORD` | Password for the application DB user | `sample_platform` | +| `MYSQL_DATABASE` | Database name | `sample_platform` | + +> MySQL auto-creates `MYSQL_USER` with full grants on `MYSQL_DATABASE`. The root user +> is only used for the healthcheck and initial bootstrap — the application connects as +> the dedicated user. + +#### Database URI + +| Variable | Description | Default | +| -------------------------- | ---------------------------------------- | ------- | +| `SQLALCHEMY_DATABASE_URI` | Full SQLAlchemy connection string | *(see env.example)* | + +> Must be kept consistent with the MySQL variables above. Format: +> `mysql+pymysql://:@db/?charset=utf8mb4` + +#### Networking + +| Variable | Description | Default | +| ------------------ | --------------------------------- | ------- | +| `APP_PORT` | Host port for the Flask app | `5000` | +| `DB_EXTERNAL_PORT` | Host port for direct MySQL access | `3306` | + +#### Security + +| Variable | Description | Default | +| ------------ | ---------------------------------------------------- | ------------------------- | +| `SECRET_KEY` | Flask session secret (fallback; file overrides it) | `change-me-in-production` | +| `HMAC_KEY` | Used by `mod_auth` for email verification tokens | `change-me-in-production` | + +> **How secret keys actually work**: `run.py` reads binary files `/app/secret_key` and +> `/app/secret_csrf` at startup and uses their contents as the real `SECRET_KEY` and +> `CSRF_SESSION_KEY`. These files are auto-generated by the entrypoint on first run. 
+> The `SECRET_KEY` env var in `.env` is only a config-level fallback used briefly +> before the file-based keys overwrite it. + +⚠️ **Change `SECRET_KEY` and `HMAC_KEY`** before deploying to any shared environment. + +#### Google Cloud Storage + +| Variable | Description | Default | +| ---------------------- | ------------------------------------------------ | --------------------- | +| `GCS_BUCKET_NAME` | GCS bucket for sample file storage | `sample-platform-dev` | +| `SERVICE_ACCOUNT_FILE` | Filename of the GCP key (relative to project root)| `service-account.json`| + +> ⚠️ **Must be non-empty**: `run.py` calls `client.bucket(name)` at import time. Use your real bucket name or +> keep the default placeholder. + +#### GitHub + +| Variable | Description | Default | +| ------------------- | ---------------------------------- | ------------- | +| `GITHUB_TOKEN` | Personal access token for API | *(empty)* | +| `GITHUB_OWNER` | GitHub org/user owning the repo | `CCExtractor` | +| `GITHUB_REPOSITORY` | Repository name | `ccextractor` | + +#### Admin Bootstrap + +| Variable | Description | Default | +| ---------------- | ----------------------------------- | ------------------- | +| `ADMIN_USERNAME` | Username for auto-created admin | `admin` | +| `ADMIN_EMAIL` | Email for auto-created admin | `admin@example.com` | +| `ADMIN_PASSWORD` | Password for auto-created admin | `admin` | + +#### Email (Mailgun) + +| Variable | Description | Default | +| --------------- | ---------------------- | --------- | +| `EMAIL_DOMAIN` | Mailgun sending domain | *(empty)* | +| `EMAIL_API_KEY` | Mailgun API key | *(empty)* | + +#### Feature Flags + +| Variable | Description | Default | +| --------------------- | ------------------------------------------------- | ------------- | +| `INSTALL_SAMPLE_DATA` | Seed DB with sample categories & regression tests | `false` | +| `MAINTENANCE` | Enable maintenance mode | `false` | +| `FLASK_ENV` | Flask environment (`development` / `production`) | 
`development` | + +--- + +### Google Cloud Storage + +The platform uses GCS to store sample files. To enable this: + +1. Create a GCP service account with **Storage Object Admin** permissions. +2. Download the JSON key file. +3. Save it as `./service-account.json` in the project root. +4. Set `GCS_BUCKET_NAME` in your `.env`. + +**Mounting Strategy**: +- **Enabled**: If `GCS_BUCKET_NAME` is set, the container mounts the bucket to `/mnt/gcs_repository` and updates `SAMPLE_REPOSITORY` to point there. This ensures the GCS mount doesn't conflict with your local code volume. +- **Disabled**: If not set, `SAMPLE_REPOSITORY` defaults to `/repository`, which is a standard Docker volume persisting data to your local machine. + +--- + +### GitHub Integration + +GitHub features (CI webhooks, PR testing) require a **Personal Access Token** with `public_repo` scope: + +1. Generate a token at [github.com/settings/tokens](https://github.com/settings/tokens). +2. Set `GITHUB_TOKEN` in your `.env`. + +--- + +## Usage + +### Starting the Platform + +```bash +docker compose up -d --build +``` + +- `-d` runs in detached mode (background). +- `--build` rebuilds the image if `Dockerfile` or `requirements.txt` changed. + +**Startup sequence** (handled by `docker-entrypoint.sh`): + +1. MySQL starts and becomes healthy (~10 s). +2. Backend container starts and the entrypoint: + 1. **Generates secret key files** (`/app/secret_key`, `/app/secret_csrf`) if they don't exist. + 2. **Initializes a git repo** at `/app` (required by GitPython for build-commit display). + 3. **Creates directories** mirroring `install/install.sh` (including `TempFiles/`, `TestFiles/media/`, `TestData/ci-linux/`, `TestData/ci-windows/`, etc.). + 4. **Copies sample files** (`sample1.ts`, `sample2.ts`) to `TestFiles/` and CI scripts to `TestData/`. + 5. **Waits for MySQL** to accept connections. + 6. **Runs database migrations** (stamps HEAD on fresh databases to avoid conflicts). + 7. 
**Creates the admin user** if no admin exists in the DB (checked via SQL query). + 8. **Seeds sample data** if `INSTALL_SAMPLE_DATA=true` and no admin existed. + 9. **Starts Gunicorn** on port 5000. + +### Stopping the Platform + +```bash +docker compose down +``` + +> Data is persisted in Docker volumes (`db_data`, `repository_data`), so nothing is lost. + +### Viewing Logs + +```bash +# All services +docker compose logs -f + +# Backend only +docker logs -f sample_platform_backend + +# MySQL only +docker logs -f sample_platform_db +``` + +### Live Code Reloading (Development) + +The project root is mounted into the container at `/app`, so **any code change on your host +is immediately visible inside the container**. However, Gunicorn doesn't auto-reload by default. + +To pick up changes without rebuilding: + +```bash +# Restart just the backend (fast, no rebuild) +docker compose restart backend +``` + +If you changed `requirements.txt` or `Dockerfile`, you need a full rebuild: + +```bash +docker compose up -d --build +``` + +### Full Reset (Clean Slate) + +```bash +# Remove containers AND volumes (wipes DB + repository data + secret keys) +docker compose down -v + +# Rebuild from scratch +docker compose up -d --build +``` + +--- + +## Database + +### Connecting Directly + +From your host, using the application user: + +```bash +mysql -h 127.0.0.1 -P 3306 -u sample_platform -psample_platform sample_platform +``` + +Or via Docker: + +```bash +docker exec -it sample_platform_db mysql -u sample_platform -psample_platform sample_platform +``` + +### Migrations + +The entrypoint handles migrations automatically. 
To run them manually: + +```bash +# Generate a new migration +docker exec -it sample_platform_backend flask db migrate -m "Description" + +# Apply pending migrations +docker exec -it sample_platform_backend flask db upgrade + +# View migration history +docker exec -it sample_platform_backend flask db history + +# Check current migration state +docker exec -it sample_platform_backend flask db current +``` + +### Re-seeding + +The entrypoint checks the database directly for an existing admin user — there is no +file-based sentinel. To re-seed from scratch: + +```bash +# Full reset — wipes the DB volume and restarts +docker compose down -v +docker compose up -d --build +``` + +Or to re-seed without wiping: + +```bash +# Delete ALL users (the entrypoint will recreate the admin on restart), then restart +docker exec sample_platform_db mysql -u root -proot sample_platform -e "DELETE FROM user;" +docker compose restart backend +``` + +--- + +## Design Decisions + +### Why mount GCS to `/mnt/gcs_repository`? + +We previously mounted GCS directly to `/repository`. However, `docker-compose.yml` mounts a local volume to `/repository` for persistence. GCS FUSE requires an empty directory (or specific flags) and mounting it *over* a Docker volume hides the volume's contents and causes "non-empty directory" errors. + +**Solution**: We mount GCS to a dedicated, clean path (`/mnt/gcs_repository`) and export `SAMPLE_REPOSITORY` to point to it. The application respects this variable, seamlessly switching between local storage and Cloud Storage without code changes. + +### Why are secret keys generated at runtime, not in the Dockerfile? + +`run.py` reads two binary files (`secret_key`, `secret_csrf`) to set Flask's `SECRET_KEY` and +`CSRF_SESSION_KEY`. We generate these files in the **entrypoint** (runtime) rather than the +**Dockerfile** (build time) because: + +- **Security**: Build-time files are baked into image layers and visible via `docker history`. 
+- **Uniqueness**: Each container gets its own keys instead of sharing from a single image. +- **Dev compatibility**: The `.:/app` volume mount would overwrite build-time files anyway. + +### Why does the container need a git repo? + +`run.py` line 69-70 uses GitPython to read `repo.head.object.hexsha` for build-commit display +in the UI. It crashes at import time if no `.git` directory exists. The `.dockerignore` (correctly) +excludes `.git`, so the entrypoint creates a minimal repo at runtime. + +### Why use `flask db stamp head` instead of `flask db upgrade`? + +On a fresh database, `create_all()` (called by the application startup or `init_db.py`) builds the full schema. If we were to run `flask db upgrade` instead, it might try to create tables but could conflict with `create_all` logic or require a linear migration history that matches the current models exactly. Instead, we let `create_all` or the application manage table creation, and stamp the database as "already at HEAD" so Alembic knows the schema is current. + +### Why a dedicated MySQL user? + +MySQL's `MYSQL_USER` + `MYSQL_PASSWORD` env vars auto-create a user with grants only on +`MYSQL_DATABASE`. The root user is used only for the healthcheck probe. This follows the +principle of least privilege. + +### Why `env_file` without a duplicate `environment:` block? + +Docker Compose's `environment:` section **overrides** `env_file` for any duplicate keys. +Having both is confusing — you think you're editing `.env` but the compose file silently +overrides your changes. Using `env_file` alone keeps `.env` as the single source of truth. + +--- + +## Troubleshooting + +### Container exits immediately + +```bash +docker logs sample_platform_backend +``` + +| Symptom | Cause & Fix | +| ----------------------------------------- | ------------------------------------------------------------------------------ | +| `SecretKeyInstallationException` | `/app/secret_key` or `/app/secret_csrf` missing. 
Entrypoint should create them. Rebuild image. | +| `git.exc.InvalidGitRepositoryError` | `.git` directory missing. Entrypoint should init one. Rebuild image. | +| `No module named 'config'` | `config.py` not in build context. Check `.dockerignore`. | +| `MySQL connection timeout` | MySQL isn't ready. Increase `retries` in healthcheck or `sleep` in entrypoint. | +| `ModuleNotFoundError: No module named 'X'` | Missing pip dependency. Add to `requirements.txt` and rebuild. | +| `OperationalError: (1045, "Access denied")`| `SQLALCHEMY_DATABASE_URI` user/password doesn't match `MYSQL_USER`/`MYSQL_PASSWORD` in `.env`. | + +### Port conflict + +If port 5000 or 3306 is already in use, change in `.env`: + +```env +APP_PORT=8080 +DB_EXTERNAL_PORT=3307 +``` + +### Database migration errors on fresh DB + +The entrypoint stamps the database at HEAD on first run to avoid conflicts between +`create_all()` and Alembic. If you still see errors, do a full reset: + +```bash +docker compose down -v +docker compose up -d --build +``` + +### Windows line-ending issues + +If you see `/bin/bash^M: bad interpreter`, the entrypoint has Windows-style line endings. The +Dockerfile runs `sed -i 's/\r$//'` to fix this automatically. If you've volume-mounted and +edited the file on Windows, convert manually: + +```bash +# Git Bash +sed -i 's/\r$//' docker-entrypoint.sh + +# Or configure git globally +git config core.autocrlf input +``` + +--- + +## File Overview + +``` +. 
+├── .dockerignore # Files excluded from Docker build context +├── .env # Your local config (git-ignored, single source of truth) +├── Dockerfile # Image: Python 3.11 + system deps + pip install +├── DOCKER.md # ← You are here +├── docker-compose.yml # Orchestration (db + backend), reads .env +├── docker-entrypoint.sh # Runtime: secrets → git → dirs → DB wait → migrate → gunicorn +├── env.example # Template for .env (committed to git) +├── config.py # Flask config (reads env vars, no hardcoded project values) +├── utility.py # Helper functions (GCS download fallback, etc.) +├── requirements.txt # Python dependencies +├── run.py # Flask application entry point +└── service-account.json # GCP key (If missing, Docker might create a directory here; entrypoint handles this) +``` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..bfc486de --- /dev/null +++ b/Dockerfile @@ -0,0 +1,81 @@ +FROM python:3.11-slim-bullseye + +# Environment variables to optimize Python for Docker +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + DEBIAN_FRONTEND=noninteractive \ + FLASK_APP=run.py + +# 1. Install System Dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + pkg-config \ + default-libmysqlclient-dev \ + default-mysql-client \ + libxml2-dev \ + libxslt-dev \ + libmagic1 \ + mediainfo \ + git \ + lsb-release \ + curl \ + netcat-openbsd \ + gnupg2 \ + fuse \ + && export GCSFUSE_REPO=gcsfuse-`lsb_release -c -s` \ + && echo "deb [signed-by=/usr/share/keyrings/cloud.google.asc] https://packages.cloud.google.com/apt $GCSFUSE_REPO main" | tee /etc/apt/sources.list.d/gcsfuse.list \ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | tee /usr/share/keyrings/cloud.google.asc \ + && apt-get update \ + && apt-get install -y gcsfuse \ + && rm -rf /var/lib/apt/lists/* + +# 2. Setup Workspace +WORKDIR /app + +# 3. Install all Python dependencies in a single layer +COPY requirements.txt . 
+RUN pip install --no-cache-dir --upgrade pip wheel setuptools && \ + pip install --no-cache-dir mysqlclient lxml cryptography && \ + pip install --no-cache-dir --default-timeout=100 -r requirements.txt && \ + pip install --no-cache-dir gunicorn + +# 4. Copy Application Code +COPY run.py manage.py config.py config_parser.py config_sample.py database.py \ + decorators.py exceptions.py log_configuration.py mailer.py utility.py \ + bootstrap_gunicorn.py ./ +COPY mod_auth/ mod_auth/ +COPY mod_ci/ mod_ci/ +COPY mod_customized/ mod_customized/ +COPY mod_health/ mod_health/ +COPY mod_home/ mod_home/ +COPY mod_regression/ mod_regression/ +COPY mod_sample/ mod_sample/ +COPY mod_test/ mod_test/ +COPY mod_upload/ mod_upload/ +COPY templates/ templates/ +COPY static/ static/ +COPY install/ install/ +COPY migrations/ migrations/ +COPY tests/ tests/ + +# 5. Create logs directory & setup entrypoint +COPY docker-entrypoint.sh /usr/local/bin/ +RUN mkdir -p logs && \ + sed -i 's/\r$//' /usr/local/bin/docker-entrypoint.sh && \ + chmod +x /usr/local/bin/docker-entrypoint.sh + +# 6. Create a non-root user for running the application server +RUN apt-get update && apt-get install -y --no-install-recommends gosu && \ + rm -rf /var/lib/apt/lists/* && \ + groupadd --gid 1001 appuser && \ + useradd --uid 1001 --gid appuser --shell /bin/bash --create-home appuser && \ + chown -R appuser:appuser /app + +# 7. Switch to non-root user +USER appuser + +# 8. Expose the Flask Port +EXPOSE 5000 + +# 8. Define the runtime command +ENTRYPOINT ["docker-entrypoint.sh"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..348fa958 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,64 @@ +services: + # --- 1. 
Database Service --- + db: + image: mysql:8.0 + container_name: sample_platform_db + environment: + MYSQL_ROOT_PASSWORD: ${MYSQL_ROOT_PASSWORD} + MYSQL_DATABASE: ${MYSQL_DATABASE} + MYSQL_USER: ${MYSQL_USER} + MYSQL_PASSWORD: ${MYSQL_PASSWORD} + MYSQL_CHARSET: utf8mb4 + MYSQL_COLLATION: utf8mb4_unicode_ci + ports: + - "${DB_EXTERNAL_PORT:-3306}:3306" + volumes: + - db_data:/var/lib/mysql + command: --default-authentication-plugin=mysql_native_password --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci + healthcheck: + test: [ "CMD", "mysqladmin", "ping", "-h", "localhost", "-u", "root", "-p${MYSQL_ROOT_PASSWORD}" ] + interval: 5s + timeout: 5s + retries: 20 + networks: + - sample_platform_network + + # --- 2. Backend Service (Flask) --- + backend: + build: . + container_name: sample_platform_backend + user: root + ports: + - "${APP_PORT:-5000}:5000" + depends_on: + db: + condition: service_healthy + volumes: + # Live-reload: mount source code for development + - .:/app + # Prevent host pollution from Python caches + - /app/__pycache__ + - /app/logs + # Mount the Service Account Key (read-only) + - ./service-account.json:/app/service-account.json:ro + # Persistent storage for sample files + - repository_data:/repository + env_file: + - .env + networks: + - sample_platform_network + restart: unless-stopped + cap_add: + - SYS_ADMIN + devices: + - /dev/fuse + +networks: + sample_platform_network: + driver: bridge + +volumes: + db_data: + driver: local + repository_data: + driver: local diff --git a/docker-entrypoint.sh b/docker-entrypoint.sh new file mode 100644 index 00000000..db1ade29 --- /dev/null +++ b/docker-entrypoint.sh @@ -0,0 +1,292 @@ +#!/bin/bash +set -e + +# Professional Logging Function +log() { + local message="$1" + echo -e "\033[1;34m[Platform]\033[0m ${message}" + return 0 +} + +# --- 1. Ensure Secret Key Files Exist --- + +if [[ ! -f "/app/secret_key" ]]; then + log "Generating secret_key file..." 
+ head -c 24 /dev/urandom > /app/secret_key +fi +if [[ ! -f "/app/secret_csrf" ]]; then + log "Generating secret_csrf file..." + head -c 24 /dev/urandom > /app/secret_csrf +fi + +# --- 2. Ensure Git Repo Exists --- +git config --global --add safe.directory /app +if [[ ! -d "/app/.git" ]]; then + log "Initializing git repository (required by GitPython for build commit display)..." + git init /app > /dev/null 2>&1 + git -C /app config user.email "docker@sample-platform.local" + git -C /app config user.name "Docker" + git -C /app add -A > /dev/null 2>&1 + git -C /app commit -m "Docker build" --allow-empty > /dev/null 2>&1 +fi + +# --- 3. Ensure GCP Service Account File Exists --- +SA_PATH="/app/service-account.json" +REAL_SA_PATH="$SA_PATH" + +# Docker mounts a directory if the host file doesn't exist. +if [[ -d "$SA_PATH" ]]; then + log "WARNING: $SA_PATH is a directory (likely because ./service-account.json is missing on host)." + log "Using internal path for generated credentials..." + REAL_SA_PATH="/app/generated-service-account.json" + export GOOGLE_APPLICATION_CREDENTIALS="$REAL_SA_PATH" + export SERVICE_ACCOUNT_FILE="generated-service-account.json" +fi + +if [[ ! -f "$REAL_SA_PATH" ]]; then + log "Generating dummy service-account.json at $REAL_SA_PATH (GCS will use local fallback)..." 
+ python3 -c " +import json +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.hazmat.primitives import serialization + +try: + key = rsa.generate_private_key(public_exponent=65537, key_size=2048) + pem = key.private_bytes(serialization.Encoding.PEM, + serialization.PrivateFormat.TraditionalOpenSSL, + serialization.NoEncryption()).decode() +except Exception as e: + print(f'WARNING: Key generation failed: {e}') + pem = 'DUMMY_KEY' + +sa = { + 'type': 'service_account', + 'project_id': 'docker-dev', + 'private_key_id': 'docker-dev-key', + 'private_key': pem, + 'client_email': 'docker-dev@docker-dev.iam.gserviceaccount.com', + 'client_id': '000000000000', + 'auth_uri': 'https://accounts.google.com/o/oauth2/auth', + 'token_uri': 'https://oauth2.googleapis.com/token', +} +with open('$REAL_SA_PATH', 'w') as f: + json.dump(sa, f, indent=2) +" +fi + +# Ensure logs directory exists (critical for gunicorn) +mkdir -p logs + +# --- 4. Configure & Mount Storage --- +# Determine where the repository is located. +# If GCS_BUCKET_NAME is set, we mount it to a clean path and use that. +# Otherwise, we use the default volume mount at /repository. + +if [[ -n "$GCS_BUCKET_NAME" ]] && [[ -f "$REAL_SA_PATH" ]]; then + log "GCS_BUCKET_NAME is set to '$GCS_BUCKET_NAME'. Configuring GCS mount..." + + # Use a separate mount point to avoid conflict with local volume at /repository + GCS_MOUNT_POINT="/mnt/gcs_repository" + mkdir -p "$GCS_MOUNT_POINT" + + log "Mounting '$GCS_BUCKET_NAME' to '$GCS_MOUNT_POINT'..." + set +e + gcsfuse --key-file "$REAL_SA_PATH" \ + --implicit-dirs \ + --uid 1001 --gid 1001 \ + --file-mode 666 --dir-mode 777 \ + -o allow_other \ + --debug_gcs \ + --debug_fuse \ + --log-file /tmp/gcsfuse_debug.log \ + --log-format text \ + "$GCS_BUCKET_NAME" "$GCS_MOUNT_POINT" > /tmp/gcsfuse.log 2>&1 + MOUNT_STATUS=$? 
+ set -e + + if [[ $MOUNT_STATUS -eq 0 ]]; then + log "SUCCESS: GCS bucket mounted at $GCS_MOUNT_POINT" + export SAMPLE_REPOSITORY="$GCS_MOUNT_POINT" + else + log "CRITICAL ERROR: Failed to mount GCS bucket." + log "--- gcsfuse stderr ---" + cat /tmp/gcsfuse.log + log "----------------------" + if [[ -f "/tmp/gcsfuse_debug.log" ]]; then + log "--- gcsfuse debug log (last 20 lines) ---" + tail -n 20 /tmp/gcsfuse_debug.log + log "----------------------" + fi + exit 1 + fi +else + log "GCS not configured. Using local storage." + # Default to /repository if not set + export SAMPLE_REPOSITORY="${SAMPLE_REPOSITORY:-/repository}" +fi + +# --- 5. Setup Repository Structure --- +REPO="$SAMPLE_REPOSITORY" +log "Ensuring repository structure exists in: $REPO" + +mkdir -p "${REPO}/ci-tests" +mkdir -p "${REPO}/unsafe-ccextractor" +mkdir -p "${REPO}/TempFiles" +mkdir -p "${REPO}/LogFiles" +mkdir -p "${REPO}/TestResults" +mkdir -p "${REPO}/TestFiles" +mkdir -p "${REPO}/TestFiles/media" +mkdir -p "${REPO}/QueuedFiles" +mkdir -p "${REPO}/TestData/ci-linux" +mkdir -p "${REPO}/TestData/ci-windows" +mkdir -p "${REPO}/vm_data" + +# Ensure appuser has write access to the repository +chown -R appuser:appuser "$REPO" 2>/dev/null || true + +# --- 6. Install Initial Data (if requested) --- +if [[ "${INSTALL_SAMPLE_DATA}" = "true" ]] && [[ -d "/app/install/sample_files" ]]; then + log "Copying sample files to ${REPO}/TestFiles/..." + cp -rn /app/install/sample_files/* "${REPO}/TestFiles/" 2>/dev/null || true +fi + +if [[ -d "/app/install/ci-vm/ci-windows/ci" ]]; then + cp -rn /app/install/ci-vm/ci-windows/ci/* "${REPO}/TestData/ci-windows/" 2>/dev/null || true +fi +if [[ -d "/app/install/ci-vm/ci-linux/ci" ]]; then + cp -rn /app/install/ci-vm/ci-linux/ci/* "${REPO}/TestData/ci-linux/" 2>/dev/null || true +fi + +# --- 7. Wait for Database Service --- +log "Waiting for MySQL at ${DB_HOST:-db}:${DB_PORT:-3306}..." 
+timeout=60 +counter=0 +DB_HOST="${DB_HOST:-db}" +DB_PORT="${DB_PORT:-3306}" +while ! nc -z "$DB_HOST" "$DB_PORT"; do + sleep 1 + counter=$((counter + 1)) + if [[ $counter -ge $timeout ]]; then + log "ERROR: MySQL connection timeout after ${timeout} seconds" + exit 1 + fi +done +log "MySQL is up and reachable." + +# Give MySQL extra time to finish initialization +sleep 3 + +# --- 8. Database Schema Setup --- +# Two schema mechanisms exist in this codebase: +# a) database.py → create_session() calls Base.metadata.create_all() when the app is imported +# b) Flask-Migrate (Alembic) for versioned migrations + +ALEMBIC_EXISTS=$(python3 -c " +import pymysql, os +try: + conn = pymysql.connect(host='${DB_HOST}', port=${DB_PORT}, + user='${MYSQL_USER:-root}', password='${MYSQL_PASSWORD:-root}', + database='${MYSQL_DATABASE:-sample_platform}') + cursor = conn.cursor() + cursor.execute(\"SHOW TABLES LIKE 'alembic_version'\") + result = cursor.fetchone() + conn.close() + print('yes' if result else 'no') +except Exception: + print('no') +" 2>/dev/null) + +if [[ "$ALEMBIC_EXISTS" = "no" ]]; then + log "Fresh database detected. Setting up schema..." + + # Ensure migrations directory is properly initialized + if [[ ! -d "migrations/versions" ]]; then + log "Initializing fresh migrations folder..." + rm -rf migrations + flask db init || { + log "ERROR: Failed to initialize migrations" + exit 1 + } + fi + + # Import the app (triggers create_all via create_session), then stamp HEAD. + log "Creating tables and stamping migration head..." + flask db stamp head || { + log "ERROR: Could not stamp migration head. Cannot proceed." + exit 1 + } + + flask db migrate -m "Docker auto-migration" 2>/dev/null || log "No new migrations needed" + flask db upgrade 2>/dev/null || log "No upgrades needed" +else + log "Existing database detected. Applying any pending migrations..." + + if [[ ! 
-d "migrations/versions" ]]; then + rm -rf migrations + flask db init || { + log "ERROR: Failed to initialize migrations" + exit 1 + } + flask db stamp head || { + log "ERROR: Could not stamp migration head." + exit 1 + } + fi + + flask db migrate -m "Docker auto-migration" 2>/dev/null || log "No new migrations detected" + flask db upgrade 2>/dev/null || log "No upgrades to apply" +fi + +log "Database schema is ready." + +# --- 9. Initialize Admin User and Sample Data --- +ADMIN_EXISTS=$(python3 -c " +import pymysql, os +try: + conn = pymysql.connect(host='${DB_HOST}', port=${DB_PORT}, + user='${MYSQL_USER:-root}', password='${MYSQL_ROOT_PASSWORD:-root}', + database='${MYSQL_DATABASE:-sample_platform}') + cursor = conn.cursor() + cursor.execute(\"SELECT COUNT(*) FROM user WHERE role = 'admin'\") + result = cursor.fetchone() + conn.close() + print('yes' if result and result[0] > 0 else 'no') +except Exception: + print('no') +" 2>/dev/null) + +if [[ -f "install/init_db.py" ]]; then + if [[ "$ADMIN_EXISTS" = "no" ]]; then + log "Creating Admin User..." + + python3 install/init_db.py \ + "$SQLALCHEMY_DATABASE_URI" \ + "${ADMIN_USERNAME:-admin}" \ + "${ADMIN_EMAIL:-admin@example.com}" \ + "${ADMIN_PASSWORD:-admin}" || log "Admin creation skipped (may already exist)" + + if [[ "$INSTALL_SAMPLE_DATA" = "true" ]] && [[ -f "install/sample_db.py" ]]; then + log "Populating sample data..." + python3 install/sample_db.py "$SQLALCHEMY_DATABASE_URI" || log "Sample data population skipped" + fi + else + log "Admin user already exists — skipping initialization" + fi +else + log "WARNING: install/init_db.py not found — skipping admin user creation" +fi + +# --- 10. Start Server --- +chown -R appuser:appuser /app 2>/dev/null || true + +log "Starting Gunicorn on 0.0.0.0:5000 as appuser..." 
+log "Application accessible at http://localhost:${APP_PORT:-5000}" + +exec gosu appuser gunicorn \ + --workers 4 \ + --bind 0.0.0.0:5000 \ + --timeout 120 \ + --access-logfile - \ + --error-logfile - \ + --log-level info \ + run:app \ No newline at end of file diff --git a/env.example b/env.example new file mode 100644 index 00000000..25faf19c --- /dev/null +++ b/env.example @@ -0,0 +1,63 @@ +# ============================================================ +# Sample Platform – Docker Environment Variables +# Copy env.example → .env, then edit the values below. +# ============================================================ + +# ---------- MySQL ---------- +MYSQL_ROOT_PASSWORD=root +# Application-level DB user (MySQL auto-creates this with access to MYSQL_DATABASE) +MYSQL_USER=sample_platform +MYSQL_PASSWORD=sample_platform +MYSQL_DATABASE=sample_platform + +# ---------- Networking ---------- +# Port exposed on the HOST for the Flask app (container always listens on 5000) +APP_PORT=5000 +# Port exposed on the HOST for direct MySQL access (optional, for debugging) +DB_EXTERNAL_PORT=3306 + +# ---------- Flask ---------- +FLASK_APP=run.py +FLASK_ENV=development + +# ---------- Database URI ---------- +# Constructed from the MySQL vars above. Uses the container hostname 'db'. +SQLALCHEMY_DATABASE_URI=mysql+pymysql://sample_platform:sample_platform@db/sample_platform?charset=utf8mb4 + +# ---------- Internal Container Paths ---------- +DB_HOST=db +DB_PORT=3306 +SAMPLE_REPOSITORY=/repository +INSTALL_FOLDER=/app +GOOGLE_APPLICATION_CREDENTIALS=/app/service-account.json + +# ---------- Security ---------- +SECRET_KEY=change-me-in-production +HMAC_KEY=change-me-in-production + +# ---------- Google Cloud Storage (Optional for Docker) ---------- +# For local Docker dev: leave as-is. The entrypoint auto-generates a dummy +# service-account.json, and downloads fall back to local file serving. +# For production: set your real bucket name. 
+# NOTE: Must be non-empty — run.py crashes on empty bucket name at import time.
+GCS_BUCKET_NAME=sample-platform-dev
+
+# ---------- Admin Bootstrap ----------
+# Created automatically on first run
+ADMIN_USERNAME=admin
+ADMIN_EMAIL=admin@example.com
+ADMIN_PASSWORD=admin
+
+# ---------- GitHub Integration (Optional) ----------
+GITHUB_TOKEN=
+GITHUB_OWNER=CCExtractor
+GITHUB_REPOSITORY=ccextractor
+
+# ---------- Email / Mailgun (Optional) ----------
+EMAIL_DOMAIN=
+EMAIL_API_KEY=
+
+# ---------- Feature Flags ----------
+# Populate the DB with sample categories, samples, and regression tests
+INSTALL_SAMPLE_DATA=true
+MAINTENANCE=false
diff --git a/mod_sample/controllers.py b/mod_sample/controllers.py
index 5d315f5c..4ad216b0 100755
--- a/mod_sample/controllers.py
+++ b/mod_sample/controllers.py
@@ -398,4 +398,4 @@ def delete_sample_additional(sample_id, additional_id):
             'form': form
         }
     raise SampleNotFoundException(f"Extra file {additional_id} for sample {sample.id} not found")
-    raise SampleNotFoundException(f"Sample with id {sample_id} not found")
+    raise SampleNotFoundException(f"Sample with id {sample_id} not found")
\ No newline at end of file
diff --git a/utility.py b/utility.py
index 96308e41..b0a58cab 100644
--- a/utility.py
+++ b/utility.py
@@ -10,7 +10,7 @@
 
 import requests
 import werkzeug
-from flask import abort, g, redirect, request
+from flask import abort, g, redirect, request, send_file
 
 ROOT_DIR = path.dirname(path.abspath(__file__))
 
@@ -19,6 +19,10 @@ def serve_file_download(file_name, file_folder, file_sub_folder='') -> werkzeug.
     """
     Serve file download by redirecting using Signed Download URLs.
 
+    Falls back to serving from the local filesystem if the GCS request fails
+    for any reason (the handler catches all exceptions, not only NotFound).
+    This enables Docker development environments where /repository is a plain
+    volume rather than a gcsfuse mount backed by GCS.
+ :param file_name: name of the file :type file_name: str :param file_folder: name of the folder @@ -31,15 +35,25 @@ def serve_file_download(file_name, file_folder, file_sub_folder='') -> werkzeug. from run import config, storage_client_bucket file_path = path.join(file_folder, file_sub_folder, file_name) - blob = storage_client_bucket.blob(file_path) - blob.content_disposition = f'attachment; filename="{file_name}"' - blob.patch() - url = blob.generate_signed_url( - version="v4", - expiration=timedelta(minutes=config.get('GCS_SIGNED_URL_EXPIRY_LIMIT', '')), - method="GET", - ) - return redirect(url) + + # Try GCS first (production path — /repository is a gcsfuse mount) + try: + blob = storage_client_bucket.blob(file_path) + blob.content_disposition = f'attachment; filename="{file_name}"' + blob.patch() + url = blob.generate_signed_url( + version="v4", + expiration=timedelta(minutes=config.get('GCS_SIGNED_URL_EXPIRY_LIMIT', '')), + method="GET", + ) + return redirect(url) + except Exception: + # GCS failed — fall back to local file serving (Docker dev environment) + local_path = path.join(config.get('SAMPLE_REPOSITORY', ''), file_path) + if path.isfile(local_path): + return send_file(local_path, as_attachment=True, download_name=file_name) + # File doesn't exist locally either — re-raise + raise def request_from_github(abort_code: int = 418) -> Callable: