From cbd3a05c00b58e4c7af1beb311d1d01eebf6405f Mon Sep 17 00:00:00 2001
From: Alex Gherghisan
Date: Wed, 26 Nov 2025 17:08:20 +0000
Subject: [PATCH 1/2] chore: add script to validate changes to network_config.json

---
 spartan/scripts/diff_network_config.sh | 124 ++++++++
 spartan/scripts/pr_network_validate.sh | 399 +++++++++++++++++++++++++
 2 files changed, 523 insertions(+)
 create mode 100755 spartan/scripts/diff_network_config.sh
 create mode 100755 spartan/scripts/pr_network_validate.sh

diff --git a/spartan/scripts/diff_network_config.sh b/spartan/scripts/diff_network_config.sh
new file mode 100755
index 000000000000..da60cb5581a7
--- /dev/null
+++ b/spartan/scripts/diff_network_config.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Diffs network config between main branch and a PR branch to extract only NEW resources
+# Usage: diff_network_config.sh <pr_number> <network_name>
+
+# Basic logging helpers
+log() { echo "[INFO] $(date -Is) - $*" >&2; }
+err() { echo "[ERROR] $(date -Is) - $*" >&2; }
+die() { err "$*"; exit 1; }
+
+# Check arguments
+if [[ $# -lt 2 ]]; then
+  die "Usage: $0 <pr_number> <network_name>"
+fi
+
+PR_NUMBER="$1"
+NETWORK_NAME="$2"
+
+# URLs for fetching configs
+MAIN_CONFIG_URL="https://raw.githubusercontent.com/AztecProtocol/networks/main/network_config.json"
+PR_CONFIG_URL="https://raw.githubusercontent.com/AztecProtocol/networks/refs/pull/${PR_NUMBER}/merge/network_config.json"
+
+# Temporary files
+TMP_DIR=$(mktemp -d)
+trap "rm -rf $TMP_DIR" EXIT
+
+MAIN_CONFIG_FILE="${TMP_DIR}/main_config.json"
+PR_CONFIG_FILE="${TMP_DIR}/pr_config.json"
+DIFF_OUTPUT_FILE="${TMP_DIR}/diff_output.json"
+
+log "Fetching main branch network config..."
+if ! curl -f -s -L "$MAIN_CONFIG_URL" -o "$MAIN_CONFIG_FILE"; then
+  die "Failed to fetch main branch config from $MAIN_CONFIG_URL"
+fi
+
+log "Fetching PR #${PR_NUMBER} network config..."
+if ! curl -f -s -L "$PR_CONFIG_URL" -o "$PR_CONFIG_FILE"; then
+  die "Failed to fetch PR config from $PR_CONFIG_URL"
+fi
+
+# Validate network exists in both configs
+if ! jq -e ".${NETWORK_NAME}" "$MAIN_CONFIG_FILE" >/dev/null 2>&1; then
+  die "Network '${NETWORK_NAME}' not found in main branch config"
+fi
+
+if ! jq -e ".${NETWORK_NAME}" "$PR_CONFIG_FILE" >/dev/null 2>&1; then
+  die "Network '${NETWORK_NAME}' not found in PR config"
+fi
+
+log "Extracting network configs for '${NETWORK_NAME}'..."
+MAIN_NETWORK=$(jq ".${NETWORK_NAME}" "$MAIN_CONFIG_FILE")
+PR_NETWORK=$(jq ".${NETWORK_NAME}" "$PR_CONFIG_FILE")
+
+# Extract arrays
+MAIN_BOOTNODES=$(echo "$MAIN_NETWORK" | jq -r '.bootnodes[]' 2>/dev/null || echo "")
+PR_BOOTNODES=$(echo "$PR_NETWORK" | jq -r '.bootnodes[]' 2>/dev/null || echo "")
+
+MAIN_SNAPSHOTS=$(echo "$MAIN_NETWORK" | jq -r '.snapshots[]' 2>/dev/null || echo "")
+PR_SNAPSHOTS=$(echo "$PR_NETWORK" | jq -r '.snapshots[]' 2>/dev/null || echo "")
+
+# Find NEW bootnodes (in PR but not in main)
+log "Diffing bootnodes..."
+NEW_BOOTNODES_ARRAY="[]"
+if [[ -n "$PR_BOOTNODES" ]]; then
+  while IFS= read -r pr_bootnode; do
+    if [[ -n "$pr_bootnode" ]]; then
+      # Check if this bootnode exists in main
+      if ! echo "$MAIN_BOOTNODES" | grep -Fxq "$pr_bootnode"; then
+        NEW_BOOTNODES_ARRAY=$(echo "$NEW_BOOTNODES_ARRAY" | jq --arg bn "$pr_bootnode" '. + [$bn]')
+      fi
+    fi
+  done <<< "$PR_BOOTNODES"
+fi
+
+# Find NEW snapshots (in PR but not in main)
+log "Diffing snapshots..."
+NEW_SNAPSHOTS_ARRAY="[]"
+if [[ -n "$PR_SNAPSHOTS" ]]; then
+  while IFS= read -r pr_snapshot; do
+    if [[ -n "$pr_snapshot" ]]; then
+      # Check if this snapshot exists in main
+      if ! echo "$MAIN_SNAPSHOTS" | grep -Fxq "$pr_snapshot"; then
+        NEW_SNAPSHOTS_ARRAY=$(echo "$NEW_SNAPSHOTS_ARRAY" | jq --arg snap "$pr_snapshot" '. + [$snap]')
+      fi
+    fi
+  done <<< "$PR_SNAPSHOTS"
+fi
+
+# Extract other required fields from PR config
+REGISTRY_ADDRESS=$(echo "$PR_NETWORK" | jq -r '.registryAddress')
+L1_CHAIN_ID=$(echo "$PR_NETWORK" | jq -r '.l1ChainId')
+FEE_ASSET_HANDLER_ADDRESS=$(echo "$PR_NETWORK" | jq -r '.feeAssetHandlerAddress // ""')
+
+# Validate at least one new resource exists
+NEW_BOOTNODE_COUNT=$(echo "$NEW_BOOTNODES_ARRAY" | jq 'length')
+NEW_SNAPSHOT_COUNT=$(echo "$NEW_SNAPSHOTS_ARRAY" | jq 'length')
+
+if [[ "$NEW_BOOTNODE_COUNT" -eq 0 ]] && [[ "$NEW_SNAPSHOT_COUNT" -eq 0 ]]; then
+  die "No new bootnodes or snapshots found in PR. Nothing to validate."
+fi
+
+log "Found $NEW_BOOTNODE_COUNT new bootnode(s) and $NEW_SNAPSHOT_COUNT new snapshot(s)"
+
+# Build output JSON
+jq -n \
+  --argjson bootnodes "$NEW_BOOTNODES_ARRAY" \
+  --argjson snapshots "$NEW_SNAPSHOTS_ARRAY" \
+  --arg registry "$REGISTRY_ADDRESS" \
+  --arg l1ChainId "$L1_CHAIN_ID" \
+  --arg feeAssetHandler "$FEE_ASSET_HANDLER_ADDRESS" \
+  '{
+    new_bootnodes: $bootnodes,
+    new_snapshots: $snapshots,
+    registry_address: $registry,
+    l1_chain_id: $l1ChainId,
+    fee_asset_handler_address: $feeAssetHandler
+  }' > "$DIFF_OUTPUT_FILE"
+
+# Output the result to stdout
+cat "$DIFF_OUTPUT_FILE"
+
+log "Config diff completed successfully"
diff --git a/spartan/scripts/pr_network_validate.sh b/spartan/scripts/pr_network_validate.sh
new file mode 100755
index 000000000000..44d7f72812b7
--- /dev/null
+++ b/spartan/scripts/pr_network_validate.sh
@@ -0,0 +1,399 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Validates a network config PR by deploying 2 nodes in Kubernetes using ONLY the new resources
+# Usage: pr_network_validate.sh <pr_number> <network_name> [cluster]
+
+echo "PR Network Config Validation"
+echo "============================="
+
+spartan=$(git rev-parse --show-toplevel)/spartan
+scripts_dir=$spartan/scripts
+
+# Source required scripts
+source "$scripts_dir/source_env_basic.sh"
+source "$scripts_dir/gcp_auth.sh"
+
+# Basic logging helpers
+log() { echo "[INFO] $(date -Is) - $*"; }
+err() { echo "[ERROR] $(date -Is) - $*" >&2; }
+die() { err "$*"; exit 1; }
+
+# Check arguments
+if [[ $# -lt 2 ]]; then
+  die "Usage: $0 <pr_number> <network_name> [cluster]"
+fi
+
+PR_NUMBER="$1"
+NETWORK_NAME="$2"
+CLUSTER="${3:-${CLUSTER:-kind}}"
+
+# Configuration
+NAMESPACE="pr-validate-${PR_NUMBER}"
+RELEASE_PREFIX="pr-val-${PR_NUMBER}"
+CLEANUP="${CLEANUP:-true}"
+VALIDATION_TIMEOUT="${VALIDATION_TIMEOUT:-1800}"
+AZTEC_DOCKER_IMAGE="${AZTEC_DOCKER_IMAGE:-aztecprotocol/aztec:latest}"
+
+# Check required environment variables
+if [[ -z "${AZTEC_DOCKER_IMAGE:-}" ]]; then
+  die "AZTEC_DOCKER_IMAGE is not set. Please set it to the image you want to validate."
+fi
+
+log "Configuration:"
+log "  PR Number: ${PR_NUMBER}"
+log "  Network: ${NETWORK_NAME}"
+log "  Cluster: ${CLUSTER}"
+log "  Namespace: ${NAMESPACE}"
+log "  Image: ${AZTEC_DOCKER_IMAGE}"
+
+# Perform GCP auth if not using kind
+if [[ "${CLUSTER}" != "kind" ]]; then
+  log "Authenticating to GCP..."
+  gcp_auth
+fi
+
+# Get kubectl context
+K8S_CLUSTER_CONTEXT=$(kubectl config current-context)
+log "Using kubectl context: ${K8S_CLUSTER_CONTEXT}"
+
+# Step 1: Diff configs to get new resources
+log "Step 1: Diffing network configs..."
+DIFF_OUTPUT=$("$scripts_dir/diff_network_config.sh" "$PR_NUMBER" "$NETWORK_NAME") + +if [[ -z "$DIFF_OUTPUT" ]]; then + die "Failed to diff network configs" +fi + +log "Diff output:" +echo "$DIFF_OUTPUT" | jq '.' + +# Parse diff output +NEW_BOOTNODES=$(echo "$DIFF_OUTPUT" | jq -r '.new_bootnodes | join(",")') +NEW_SNAPSHOTS=$(echo "$DIFF_OUTPUT" | jq -r '.new_snapshots | join(",")') +REGISTRY_ADDRESS=$(echo "$DIFF_OUTPUT" | jq -r '.registry_address') +L1_CHAIN_ID=$(echo "$DIFF_OUTPUT" | jq -r '.l1_chain_id') +FEE_ASSET_HANDLER_ADDRESS=$(echo "$DIFF_OUTPUT" | jq -r '.fee_asset_handler_address // ""') + +log "New resources to validate:" +log " Bootnodes: ${NEW_BOOTNODES:-}" +log " Snapshots: ${NEW_SNAPSHOTS:-}" + +# Validate at least one new resource +if [[ -z "$NEW_BOOTNODES" ]] && [[ -z "$NEW_SNAPSHOTS" ]]; then + die "No new bootnodes or snapshots to validate" +fi + +# Step 2: Create namespace +log "Step 2: Creating namespace ${NAMESPACE}..." +if kubectl get namespace "${NAMESPACE}" >/dev/null 2>&1; then + log "Namespace ${NAMESPACE} already exists. Deleting..." + kubectl delete namespace "${NAMESPACE}" --wait=true --timeout=60s || true +fi +kubectl create namespace "${NAMESPACE}" + +# Step 3: Get L1 configuration +log "Step 3: Setting up L1 configuration..." + +# Determine L1 network based on the Aztec network +# mainnet uses mainnet L1, everything else uses sepolia +if [[ "$NETWORK_NAME" == "mainnet" ]]; then + L1_NETWORK="mainnet" +else + L1_NETWORK="sepolia" +fi + +# Fetch L1 endpoints from GCP secrets if not already set +if [[ -z "${ETHEREUM_RPC_URLS:-}" ]] && [[ -z "${L1_RPC_URLS:-}" ]]; then + log "Fetching L1 RPC URLs from GCP secret: ${L1_NETWORK}-rpc-urls" + ETHEREUM_RPC_URLS=$(gcloud secrets versions access latest --secret="${L1_NETWORK}-rpc-urls" --project="${GCP_PROJECT_ID:-testnet-440309}" 2>/dev/null || echo "") + if [[ -z "$ETHEREUM_RPC_URLS" ]]; then + die "Failed to fetch ${L1_NETWORK}-rpc-urls from GCP secrets. Ensure you're authenticated." + fi +fi + +if [[ -z "${ETHEREUM_CONSENSUS_HOST_URLS:-}" ]] && [[ -z "${L1_CONSENSUS_HOST_URLS:-}" ]]; then + log "Fetching L1 Consensus URLs from GCP secret: ${L1_NETWORK}-consensus-host-urls" + ETHEREUM_CONSENSUS_HOST_URLS=$(gcloud secrets versions access latest --secret="${L1_NETWORK}-consensus-host-urls" --project="${GCP_PROJECT_ID:-testnet-440309}" 2>/dev/null || echo "") + if [[ -z "$ETHEREUM_CONSENSUS_HOST_URLS" ]]; then + die "Failed to fetch ${L1_NETWORK}-consensus-host-urls from GCP secrets. Ensure you're authenticated." + fi +fi + +# L1 endpoints - required for nodes to sync +L1_RPC_URLS="${ETHEREUM_RPC_URLS:-${L1_RPC_URLS:-}}" +L1_CONSENSUS_URLS="${ETHEREUM_CONSENSUS_HOST_URLS:-${L1_CONSENSUS_HOST_URLS:-}}" + +if [[ -z "$L1_RPC_URLS" ]]; then + die "L1_RPC_URLS or ETHEREUM_RPC_URLS must be set" +fi + +if [[ -z "$L1_CONSENSUS_URLS" ]]; then + die "L1_CONSENSUS_URLS or ETHEREUM_CONSENSUS_HOST_URLS must be set" +fi + +# Convert JSON arrays to comma-separated if needed +if [[ "$L1_RPC_URLS" == "["* ]]; then + L1_RPC_URLS=$(echo "$L1_RPC_URLS" | jq -r 'join(",")') +fi + +if [[ "$L1_CONSENSUS_URLS" == "["* ]]; then + L1_CONSENSUS_URLS=$(echo "$L1_CONSENSUS_URLS" | jq -r 'join(",")') +fi + +log " L1 RPC URLs: ${L1_RPC_URLS}" +log " L1 Consensus URLs: ${L1_CONSENSUS_URLS}" + +# Step 4: Create Helm values file +log "Step 4: Creating Helm values..." 
+TMP_DIR=$(mktemp -d) +trap "rm -rf $TMP_DIR" EXIT + +HELM_VALUES_FILE="${TMP_DIR}/pr-validate-values.yaml" + +cat > "$HELM_VALUES_FILE" << EOF +replicaCount: 2 + +global: + aztecNetwork: "" # Don't use predefined network + customAztecNetwork: + l1ChainId: ${L1_CHAIN_ID} + registryContractAddress: "${REGISTRY_ADDRESS}" + feeAssetHandlerContractAddress: "${FEE_ASSET_HANDLER_ADDRESS}" + + l1ExecutionUrls: +$(echo "$L1_RPC_URLS" | tr ',' '\n' | while read url; do echo " - \"$url\""; done) + + l1ConsensusUrls: +$(echo "$L1_CONSENSUS_URLS" | tr ',' '\n' | while read url; do echo " - \"$url\""; done) + + aztecImage: + repository: $(echo "$AZTEC_DOCKER_IMAGE" | cut -d: -f1) + tag: "$(echo "$AZTEC_DOCKER_IMAGE" | cut -d: -f2)" + pullPolicy: IfNotPresent + + sponsoredFPC: false + testAccounts: false + +node: + logLevel: "debug" + + startCmd: + - --node + - --archiver + + env: + # Override with ONLY new resources - this is the key to isolated testing + NETWORK_CONFIG_LOCATION: "" # Disable remote network config fetch + BOOTSTRAP_NODES: "${NEW_BOOTNODES}" + SYNC_SNAPSHOTS_URLS: "${NEW_SNAPSHOTS}" + SYNC_MODE: force-snapshot + L1_CHAIN_ID: "${L1_CHAIN_ID}" + REGISTRY_CONTRACT_ADDRESS: "${REGISTRY_ADDRESS}" + FEE_ASSET_HANDLER_CONTRACT_ADDRESS: "${FEE_ASSET_HANDLER_ADDRESS:-0x0000000000000000000000000000000000000000}" + LOG_LEVEL: "debug" +EOF + +log "Helm values created:" +cat "$HELM_VALUES_FILE" + +# Step 5: Deploy with Helm +log "Step 5: Deploying validation nodes..." +helm upgrade --install \ + "${RELEASE_PREFIX}" \ + "$spartan/aztec-node" \ + --namespace "${NAMESPACE}" \ + --values "$HELM_VALUES_FILE" \ + --timeout 15m + +log "Deployment complete. Waiting for pods to start..." + +# Step 6: Wait for pods to exist (not Ready - we need to check logs immediately) +log "Step 6: Waiting for pods to be created..." +for i in {1..60}; do + POD_COUNT=$(kubectl get pods -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${RELEASE_PREFIX}" --no-headers 2>/dev/null | wc -l) + if [[ "$POD_COUNT" -ge 2 ]]; then + log "Pods created!" + break + fi + if [[ $i -eq 60 ]]; then + die "Timeout waiting for pods to be created" + fi + sleep 2 +done + +# Get pod names +POD_0=$(kubectl get pods -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${RELEASE_PREFIX}" -o jsonpath='{.items[0].metadata.name}') +POD_1=$(kubectl get pods -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${RELEASE_PREFIX}" -o jsonpath='{.items[1].metadata.name}') + +log "Validation pods: ${POD_0}, ${POD_1}" +log "Starting validation checks (will monitor logs as pods start)..." + +# Step 7: Validation checks +log "Step 7: Running validation checks..." 
+ +VALIDATION_START=$(date +%s) +SUCCESS=false + +# Function to check logs for P2P connection +check_p2p_connection() { + local pod=$1 + kubectl logs -n "${NAMESPACE}" "$pod" --tail=100 2>/dev/null | grep -i "peer.*connected\|discovered peer" || true +} + +# Function to check logs for snapshot download +check_snapshot_download() { + local pod=$1 + kubectl logs -n "${NAMESPACE}" "$pod" --tail=100 2>/dev/null | grep -i "snapshot.*download\|syncing from snapshot\|downloading snapshot" || true +} + +# Function to check if node is syncing from L1 (fallback - should fail if snapshots expected) +check_l1_sync() { + local pod=$1 + kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null | grep -i "syncing from l1\|catching up from l1\|starting archiver\|archiver sync" || true +} + +# Function to check for snapshot failures +check_snapshot_failure() { + local pod=$1 + # Look for the critical failure message that means all snapshots failed + kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null | grep -i "No valid snapshots found from any URL, skipping snapshot sync\|No snapshot found at.*Skipping this URL\|Fetching.*failed\. Will retry" || true +} + +# Function to check node status +check_node_status() { + local pod=$1 + kubectl exec -n "${NAMESPACE}" "$pod" -- curl -s http://localhost:8080/status || echo "{}" +} + +log "Monitoring validation (timeout: ${VALIDATION_TIMEOUT}s)..." + +while true; do + CURRENT_TIME=$(date +%s) + ELAPSED=$((CURRENT_TIME - VALIDATION_START)) + + if [[ $ELAPSED -gt $VALIDATION_TIMEOUT ]]; then + err "Validation timeout reached (${VALIDATION_TIMEOUT}s)" + break + fi + + log "Check iteration (${ELAPSED}s elapsed)..." + + # CRITICAL CHECK: If new snapshots exist, ensure nodes are NOT syncing from L1 + if [[ -n "$NEW_SNAPSHOTS" ]]; then + log " Checking for L1 sync fallback (should NOT happen with new snapshots)..." + L1_SYNC_POD_0=$(check_l1_sync "$POD_0") + L1_SYNC_POD_1=$(check_l1_sync "$POD_1") + + if [[ -n "$L1_SYNC_POD_0" ]] || [[ -n "$L1_SYNC_POD_1" ]]; then + err " ✗ FAILURE: Nodes are syncing from L1 instead of using snapshots!" + if [[ -n "$L1_SYNC_POD_0" ]]; then + err " Pod 0: $L1_SYNC_POD_0" + fi + if [[ -n "$L1_SYNC_POD_1" ]]; then + err " Pod 1: $L1_SYNC_POD_1" + fi + err " This indicates the snapshot URL is broken or unreachable." + SUCCESS=false + break + fi + + # Check for explicit snapshot failures + SNAPSHOT_FAIL_POD_0=$(check_snapshot_failure "$POD_0") + SNAPSHOT_FAIL_POD_1=$(check_snapshot_failure "$POD_1") + + if [[ -n "$SNAPSHOT_FAIL_POD_0" ]] || [[ -n "$SNAPSHOT_FAIL_POD_1" ]]; then + err " ✗ FAILURE: Snapshot download failed!" + if [[ -n "$SNAPSHOT_FAIL_POD_0" ]]; then + err " Pod 0: $SNAPSHOT_FAIL_POD_0" + fi + if [[ -n "$SNAPSHOT_FAIL_POD_1" ]]; then + err " Pod 1: $SNAPSHOT_FAIL_POD_1" + fi + SUCCESS=false + break + fi + fi + + # Check 1: P2P discovery + log " Checking P2P connections..." + P2P_POD_0=$(check_p2p_connection "$POD_0") + P2P_POD_1=$(check_p2p_connection "$POD_1") + + if [[ -n "$P2P_POD_0" ]] || [[ -n "$P2P_POD_1" ]]; then + log " ✓ P2P connection detected!" + if [[ -n "$P2P_POD_0" ]]; then + log " Pod 0: $P2P_POD_0" + fi + if [[ -n "$P2P_POD_1" ]]; then + log " Pod 1: $P2P_POD_1" + fi + + # If we only have new snapshots (no new bootnodes), P2P is not required + if [[ -n "$NEW_BOOTNODES" ]]; then + SUCCESS=true + break + fi + fi + + # Check 2: Snapshot download (if new snapshots exist) + if [[ -n "$NEW_SNAPSHOTS" ]]; then + log " Checking snapshot downloads..." 
+ SNAPSHOT_POD_0=$(check_snapshot_download "$POD_0") + SNAPSHOT_POD_1=$(check_snapshot_download "$POD_1") + + if [[ -n "$SNAPSHOT_POD_0" ]] || [[ -n "$SNAPSHOT_POD_1" ]]; then + log " ✓ Snapshot download detected!" + if [[ -n "$SNAPSHOT_POD_0" ]]; then + log " Pod 0: $SNAPSHOT_POD_0" + fi + if [[ -n "$SNAPSHOT_POD_1" ]]; then + log " Pod 1: $SNAPSHOT_POD_1" + fi + SUCCESS=true + break + fi + fi + + log " Waiting 10s before next check..." + sleep 10 +done + +# Step 8: Report results +log "Step 8: Validation complete" + +if [[ "$SUCCESS" == "true" ]]; then + log "✓ VALIDATION PASSED" + log " - New bootnodes: ${NEW_BOOTNODES:-}" + log " - New snapshots: ${NEW_SNAPSHOTS:-}" + log " - Nodes successfully used new resources" +else + err "✗ VALIDATION FAILED" + err " - Could not verify nodes are using new resources" + err " - Check logs below for details" + + # Dump logs for debugging + log "Pod 0 logs (last 50 lines):" + kubectl logs -n "${NAMESPACE}" "$POD_0" --tail=50 || true + + log "Pod 1 logs (last 50 lines):" + kubectl logs -n "${NAMESPACE}" "$POD_1" --tail=50 || true +fi + +# Step 9: Cleanup +if [[ "$CLEANUP" == "true" ]]; then + log "Step 9: Cleaning up namespace ${NAMESPACE}..." + kubectl delete namespace "${NAMESPACE}" --wait=true --timeout=60s || true + log "Cleanup complete" +else + log "Step 9: Skipping cleanup (CLEANUP=false)" + log " To clean up manually: kubectl delete namespace ${NAMESPACE}" +fi + +# Exit with appropriate code +if [[ "$SUCCESS" == "true" ]]; then + exit 0 +else + exit 1 +fi From 35490ba1878339c7c3fc7000aa93b954fcd330b2 Mon Sep 17 00:00:00 2001 From: Alex Gherghisan Date: Thu, 27 Nov 2025 13:17:54 +0000 Subject: [PATCH 2/2] chore: update script --- spartan/scripts/pr_network_validate.sh | 235 +++++++++++++------------ 1 file changed, 125 insertions(+), 110 deletions(-) diff --git a/spartan/scripts/pr_network_validate.sh b/spartan/scripts/pr_network_validate.sh index 44d7f72812b7..3cb4b8f66cfb 100755 --- a/spartan/scripts/pr_network_validate.sh +++ b/spartan/scripts/pr_network_validate.sh @@ -58,8 +58,7 @@ fi K8S_CLUSTER_CONTEXT=$(kubectl config current-context) log "Using kubectl context: ${K8S_CLUSTER_CONTEXT}" -# Step 1: Diff configs to get new resources -log "Step 1: Diffing network configs..." +log "Diffing network configs..." DIFF_OUTPUT=$("$scripts_dir/diff_network_config.sh" "$PR_NUMBER" "$NETWORK_NAME") if [[ -z "$DIFF_OUTPUT" ]]; then @@ -85,16 +84,14 @@ if [[ -z "$NEW_BOOTNODES" ]] && [[ -z "$NEW_SNAPSHOTS" ]]; then die "No new bootnodes or snapshots to validate" fi -# Step 2: Create namespace -log "Step 2: Creating namespace ${NAMESPACE}..." +log "Creating namespace ${NAMESPACE}..." if kubectl get namespace "${NAMESPACE}" >/dev/null 2>&1; then log "Namespace ${NAMESPACE} already exists. Deleting..." kubectl delete namespace "${NAMESPACE}" --wait=true --timeout=60s || true fi kubectl create namespace "${NAMESPACE}" -# Step 3: Get L1 configuration -log "Step 3: Setting up L1 configuration..." +log "Setting up L1 configuration..." # Determine L1 network based on the Aztec network # mainnet uses mainnet L1, everything else uses sepolia @@ -145,8 +142,7 @@ fi log " L1 RPC URLs: ${L1_RPC_URLS}" log " L1 Consensus URLs: ${L1_CONSENSUS_URLS}" -# Step 4: Create Helm values file -log "Step 4: Creating Helm values..." +log "Creating Helm values..." 
TMP_DIR=$(mktemp -d) trap "rm -rf $TMP_DIR" EXIT @@ -183,6 +179,13 @@ node: - --node - --archiver + # Enable P2P with node port to get public IP + p2p: + enabled: true + publicIP: true + port: 40400 + announcePort: 40400 + env: # Override with ONLY new resources - this is the key to isolated testing NETWORK_CONFIG_LOCATION: "" # Disable remote network config fetch @@ -198,8 +201,7 @@ EOF log "Helm values created:" cat "$HELM_VALUES_FILE" -# Step 5: Deploy with Helm -log "Step 5: Deploying validation nodes..." +log "Deploying validation nodes..." helm upgrade --install \ "${RELEASE_PREFIX}" \ "$spartan/aztec-node" \ @@ -209,8 +211,7 @@ helm upgrade --install \ log "Deployment complete. Waiting for pods to start..." -# Step 6: Wait for pods to exist (not Ready - we need to check logs immediately) -log "Step 6: Waiting for pods to be created..." +log "Waiting for pods to be created..." for i in {1..60}; do POD_COUNT=$(kubectl get pods -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${RELEASE_PREFIX}" --no-headers 2>/dev/null | wc -l) if [[ "$POD_COUNT" -ge 2 ]]; then @@ -230,35 +231,74 @@ POD_1=$(kubectl get pods -n "${NAMESPACE}" -l "app.kubernetes.io/instance=${RELE log "Validation pods: ${POD_0}, ${POD_1}" log "Starting validation checks (will monitor logs as pods start)..." -# Step 7: Validation checks -log "Step 7: Running validation checks..." +log "Running validation checks..." VALIDATION_START=$(date +%s) -SUCCESS=false +P2P_SUCCESS=false +SNAPSHOT_SUCCESS=false + +# Determine what we need to validate +NEED_P2P_CHECK=false +NEED_SNAPSHOT_CHECK=false + +if [[ -n "$NEW_BOOTNODES" ]]; then + NEED_P2P_CHECK=true + log "Will validate P2P discovery via new bootnode(s)" +fi + +if [[ -n "$NEW_SNAPSHOTS" ]]; then + NEED_SNAPSHOT_CHECK=true + log "Will validate snapshot download from new URL(s)" +fi # Function to check logs for P2P connection check_p2p_connection() { local pod=$1 - kubectl logs -n "${NAMESPACE}" "$pod" --tail=100 2>/dev/null | grep -i "peer.*connected\|discovered peer" || true + # Look for actual peer connections, not "Connected to 0 peers" + # Match patterns like: + # - "Connected to X peers" where X > 0 + # - "peer abc123 connected" + # - "discovered peer xyz789" + local logs=$(kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null) + + # Check for "Connected to X peers" where X > 0 + if echo "$logs" | grep -qE "Connected to [1-9][0-9]* peer"; then + echo "$logs" | grep -E "Connected to [1-9][0-9]* peer" + return 0 + fi + + # Check for specific peer connection messages (but exclude "Connected to 0 peers") + echo "$logs" | grep -iE "(peer [a-zA-Z0-9]+ connected|discovered peer [a-zA-Z0-9]+|connection established.*peer)" | grep -v "Connected to 0 peer" || true } # Function to check logs for snapshot download +# Returns: "success", "failed", or "unknown" check_snapshot_download() { local pod=$1 - kubectl logs -n "${NAMESPACE}" "$pod" --tail=100 2>/dev/null | grep -i "snapshot.*download\|syncing from snapshot\|downloading snapshot" || true -} + local logs=$(kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null) -# Function to check if node is syncing from L1 (fallback - should fail if snapshots expected) -check_l1_sync() { - local pod=$1 - kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null | grep -i "syncing from l1\|catching up from l1\|starting archiver\|archiver sync" || true -} + # Check for failure messages first + if echo "$logs" | grep -qi "No valid snapshots found from any URL, skipping snapshot sync"; then + echo "failed" + return + fi -# Function 
to check for snapshot failures -check_snapshot_failure() { - local pod=$1 - # Look for the critical failure message that means all snapshots failed - kubectl logs -n "${NAMESPACE}" "$pod" --tail=200 2>/dev/null | grep -i "No valid snapshots found from any URL, skipping snapshot sync\|No snapshot found at.*Skipping this URL\|Fetching.*failed\. Will retry" || true + if echo "$logs" | grep -qi "No snapshot found at.*Skipping this URL"; then + # This might be trying multiple URLs, check if ALL failed + if echo "$logs" | grep -qi "No valid snapshots found from any URL"; then + echo "failed" + return + fi + fi + + # Check for success messages + if echo "$logs" | grep -qi "snapshot.*download\|syncing from snapshot\|downloading snapshot"; then + echo "success" + return + fi + + # Neither success nor failure detected yet + echo "unknown" } # Function to check node status @@ -280,119 +320,94 @@ while true; do log "Check iteration (${ELAPSED}s elapsed)..." - # CRITICAL CHECK: If new snapshots exist, ensure nodes are NOT syncing from L1 - if [[ -n "$NEW_SNAPSHOTS" ]]; then - log " Checking for L1 sync fallback (should NOT happen with new snapshots)..." - L1_SYNC_POD_0=$(check_l1_sync "$POD_0") - L1_SYNC_POD_1=$(check_l1_sync "$POD_1") + if [[ "$NEED_SNAPSHOT_CHECK" == "true" ]] && [[ "$SNAPSHOT_SUCCESS" == "false" ]]; then + log " Checking snapshot downloads..." + SNAPSHOT_STATUS_POD_0=$(check_snapshot_download "$POD_0") + SNAPSHOT_STATUS_POD_1=$(check_snapshot_download "$POD_1") - if [[ -n "$L1_SYNC_POD_0" ]] || [[ -n "$L1_SYNC_POD_1" ]]; then - err " ✗ FAILURE: Nodes are syncing from L1 instead of using snapshots!" - if [[ -n "$L1_SYNC_POD_0" ]]; then - err " Pod 0: $L1_SYNC_POD_0" + if [[ "$SNAPSHOT_STATUS_POD_0" == "failed" ]] || [[ "$SNAPSHOT_STATUS_POD_1" == "failed" ]]; then + err " ✗ FAILURE: Snapshot download failed!" + if [[ "$SNAPSHOT_STATUS_POD_0" == "failed" ]]; then + err " Pod 0: Snapshot failed" + kubectl logs -n "${NAMESPACE}" "$POD_0" --tail=50 2>/dev/null | grep -i "snapshot\|No valid snapshots" | while IFS= read -r line; do + err " $line" + done fi - if [[ -n "$L1_SYNC_POD_1" ]]; then - err " Pod 1: $L1_SYNC_POD_1" + if [[ "$SNAPSHOT_STATUS_POD_1" == "failed" ]]; then + err " Pod 1: Snapshot failed" + kubectl logs -n "${NAMESPACE}" "$POD_1" --tail=50 2>/dev/null | grep -i "snapshot\|No valid snapshots" | while IFS= read -r line; do + err " $line" + done fi - err " This indicates the snapshot URL is broken or unreachable." - SUCCESS=false break fi - # Check for explicit snapshot failures - SNAPSHOT_FAIL_POD_0=$(check_snapshot_failure "$POD_0") - SNAPSHOT_FAIL_POD_1=$(check_snapshot_failure "$POD_1") - - if [[ -n "$SNAPSHOT_FAIL_POD_0" ]] || [[ -n "$SNAPSHOT_FAIL_POD_1" ]]; then - err " ✗ FAILURE: Snapshot download failed!" - if [[ -n "$SNAPSHOT_FAIL_POD_0" ]]; then - err " Pod 0: $SNAPSHOT_FAIL_POD_0" + if [[ "$SNAPSHOT_STATUS_POD_0" == "success" ]] || [[ "$SNAPSHOT_STATUS_POD_1" == "success" ]]; then + log " ✓ Snapshot download detected!" 
+ if [[ "$SNAPSHOT_STATUS_POD_0" == "success" ]]; then + log " Pod 0: Snapshot download started" + kubectl logs -n "${NAMESPACE}" "$POD_0" --tail=50 2>/dev/null | grep -i "snapshot.*download\|syncing from snapshot" | while IFS= read -r line; do + log " $line" + done fi - if [[ -n "$SNAPSHOT_FAIL_POD_1" ]]; then - err " Pod 1: $SNAPSHOT_FAIL_POD_1" + if [[ "$SNAPSHOT_STATUS_POD_1" == "success" ]]; then + log " Pod 1: Snapshot download started" + kubectl logs -n "${NAMESPACE}" "$POD_1" --tail=50 2>/dev/null | grep -i "snapshot.*download\|syncing from snapshot" | while IFS= read -r line; do + log " $line" + done fi - SUCCESS=false - break + SNAPSHOT_SUCCESS=true fi - fi - - # Check 1: P2P discovery - log " Checking P2P connections..." - P2P_POD_0=$(check_p2p_connection "$POD_0") - P2P_POD_1=$(check_p2p_connection "$POD_1") - if [[ -n "$P2P_POD_0" ]] || [[ -n "$P2P_POD_1" ]]; then - log " ✓ P2P connection detected!" - if [[ -n "$P2P_POD_0" ]]; then - log " Pod 0: $P2P_POD_0" - fi - if [[ -n "$P2P_POD_1" ]]; then - log " Pod 1: $P2P_POD_1" - fi - - # If we only have new snapshots (no new bootnodes), P2P is not required - if [[ -n "$NEW_BOOTNODES" ]]; then - SUCCESS=true - break + if [[ "$SNAPSHOT_STATUS_POD_0" == "unknown" ]] && [[ "$SNAPSHOT_STATUS_POD_1" == "unknown" ]]; then + log " Snapshot status: still waiting..." fi fi - # Check 2: Snapshot download (if new snapshots exist) - if [[ -n "$NEW_SNAPSHOTS" ]]; then - log " Checking snapshot downloads..." - SNAPSHOT_POD_0=$(check_snapshot_download "$POD_0") - SNAPSHOT_POD_1=$(check_snapshot_download "$POD_1") - if [[ -n "$SNAPSHOT_POD_0" ]] || [[ -n "$SNAPSHOT_POD_1" ]]; then - log " ✓ Snapshot download detected!" - if [[ -n "$SNAPSHOT_POD_0" ]]; then - log " Pod 0: $SNAPSHOT_POD_0" + if [[ "$NEED_P2P_CHECK" == "true" ]] && [[ "$P2P_SUCCESS" == "false" ]]; then + log " Checking P2P connections..." + P2P_POD_0=$(check_p2p_connection "$POD_0") + P2P_POD_1=$(check_p2p_connection "$POD_1") + + if [[ -n "$P2P_POD_0" ]] || [[ -n "$P2P_POD_1" ]]; then + log " ✓ P2P connection detected!" + if [[ -n "$P2P_POD_0" ]]; then + log " Pod 0 logs:" + echo "$P2P_POD_0" | while IFS= read -r line; do + log " $line" + done fi - if [[ -n "$SNAPSHOT_POD_1" ]]; then - log " Pod 1: $SNAPSHOT_POD_1" + if [[ -n "$P2P_POD_1" ]]; then + log " Pod 1 logs:" + echo "$P2P_POD_1" | while IFS= read -r line; do + log " $line" + done fi - SUCCESS=true - break + P2P_SUCCESS=true fi fi + if [[ "$NEED_P2P_CHECK" == "false" || "$P2P_SUCCESS" == "true" ]] && [[ "$NEED_SNAPSHOT_CHECK" == "false" || "$SNAPSHOT_SUCCESS" == "true" ]]; then + log " ✓ All validation checks passed!" + break + fi + log " Waiting 10s before next check..." sleep 10 done -# Step 8: Report results -log "Step 8: Validation complete" - -if [[ "$SUCCESS" == "true" ]]; then - log "✓ VALIDATION PASSED" - log " - New bootnodes: ${NEW_BOOTNODES:-}" - log " - New snapshots: ${NEW_SNAPSHOTS:-}" - log " - Nodes successfully used new resources" -else - err "✗ VALIDATION FAILED" - err " - Could not verify nodes are using new resources" - err " - Check logs below for details" - - # Dump logs for debugging - log "Pod 0 logs (last 50 lines):" - kubectl logs -n "${NAMESPACE}" "$POD_0" --tail=50 || true - - log "Pod 1 logs (last 50 lines):" - kubectl logs -n "${NAMESPACE}" "$POD_1" --tail=50 || true -fi -# Step 9: Cleanup if [[ "$CLEANUP" == "true" ]]; then - log "Step 9: Cleaning up namespace ${NAMESPACE}..." + log "Cleaning up namespace ${NAMESPACE}..." 
kubectl delete namespace "${NAMESPACE}" --wait=true --timeout=60s || true log "Cleanup complete" else - log "Step 9: Skipping cleanup (CLEANUP=false)" + log "Skipping cleanup (CLEANUP=false)" log " To clean up manually: kubectl delete namespace ${NAMESPACE}" fi -# Exit with appropriate code -if [[ "$SUCCESS" == "true" ]]; then +if [[ "$NEED_P2P_CHECK" == "false" || "$P2P_SUCCESS" == "true" ]] && [[ "$NEED_SNAPSHOT_CHECK" == "false" || "$SNAPSHOT_SUCCESS" == "true" ]]; then exit 0 else exit 1
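---

Example usage (illustrative only: the PR number, network name, and the JSON values sketched below are hypothetical placeholders, not taken from a real PR; the output keys match the jq -n call in diff_network_config.sh and the environment overrides are the ones defined at the top of pr_network_validate.sh):

# Inspect which resources a networks PR adds; the script prints a JSON summary to stdout.
./spartan/scripts/diff_network_config.sh 1234 testnet
# {
#   "new_bootnodes": ["enr:-..."],
#   "new_snapshots": ["https://example.com/snapshots/testnet"],
#   "registry_address": "0x...",
#   "l1_chain_id": "11155111",
#   "fee_asset_handler_address": "0x..."
# }

# Run the full validation against a kind cluster, keeping the namespace around afterwards
# for inspection (CLEANUP=false) and with a shorter timeout than the 1800s default.
CLEANUP=false VALIDATION_TIMEOUT=900 AZTEC_DOCKER_IMAGE=aztecprotocol/aztec:latest \
  ./spartan/scripts/pr_network_validate.sh 1234 testnet kind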