Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9163a11
Fix Frontier benchmark SLURM: use batch+1:59+normal QOS
Mar 6, 2026
ffe80ec
Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240)
Mar 6, 2026
cfbc023
Remove persistent build cache for self-hosted test runners
sbryngelson Mar 6, 2026
5742030
Remove build cache from benchmark jobs on Phoenix and Frontier
sbryngelson Mar 6, 2026
7edb7c3
Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state
sbryngelson Mar 6, 2026
773f5ad
Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh
sbryngelson Mar 6, 2026
1311cbe
Reduce benchmark steps and switch Frontier bench to batch/normal QOS
sbryngelson Mar 5, 2026
644c9e4
Cap bench script parallelism at 64 to fix GNR node failures
sbryngelson Mar 3, 2026
a02f4b2
Disable AVX-512 FP16 to fix build on Granite Rapids nodes
sbryngelson Mar 3, 2026
ba91673
Fix Rich MarkupError crash when build output contains bracket paths
sbryngelson Mar 2, 2026
438627e
Merge branch 'master' into fix/ci-robustness
sbryngelson Mar 6, 2026
3e773ff
Address bot review comments: sacct -X flag, dead job_type var, stale …
Mar 6, 2026
fae2e6a
Fix bench: use PR's submit.sh for master job to get SIGKILL recovery
sbryngelson Mar 6, 2026
3224931
Fix submit_and_monitor_bench.sh: define SCRIPT_DIR before use
sbryngelson Mar 6, 2026
2887def
bench: update Phoenix tmpbuild path to project storage
sbryngelson Mar 7, 2026
1e4f984
Fix bench timeout (240→480) and monitor scancel defeating sacct recovery
sbryngelson Mar 7, 2026
5886f2a
Fix sacct empty-output edge case in run_monitored_slurm_job.sh
sbryngelson Mar 7, 2026
0551dea
bench: dynamic Phoenix GPU partition, per-case logs, downgrade grind …
sbryngelson Mar 8, 2026
16e0f76
bench: address code review findings in GPU partition selection
sbryngelson Mar 8, 2026
b396a1c
ci: add gpu-h200 partition to Phoenix test and case-optimization GPU …
sbryngelson Mar 8, 2026
7e5cabe
ci: scancel orphaned SLURM jobs when GitHub Actions cancels the runner
sbryngelson Mar 8, 2026
cf4f2a6
Fix Phoenix CPU test: restore build cache to isolate concurrent jobs
sbryngelson Mar 8, 2026
7abbce7
Revert "Fix Phoenix CPU test: restore build cache to isolate concurre…
sbryngelson Mar 8, 2026
df23011
Fix Phoenix test: pass explicit GPU flag to test command
sbryngelson Mar 8, 2026
8f586ae
ci: remove self-hosted runner build cache
sbryngelson Mar 8, 2026
24f25f3
ci: nuke entire build dir on attempt 3 of retry_build
sbryngelson Mar 8, 2026
0104233
ci: reduce to 2 attempts, nuke build dir on retry
sbryngelson Mar 8, 2026
ffb43f7
ci: revert case-opt to clean: false to preserve SLURM build cache
sbryngelson Mar 8, 2026
fb6101d
ci: treat PREEMPTED as non-terminal so --requeue jobs keep being moni…
sbryngelson Mar 8, 2026
68592d7
ci: clean build dir before case-opt pre-build; drop retry
sbryngelson Mar 8, 2026
0775fde
ci: remove dead RETRY_CLEAN_CMD from bench.sh
sbryngelson Mar 8, 2026
aa21620
ci: allow Frontier jobs to fail without blocking workflow
sbryngelson Mar 8, 2026
18311b8
ci: fix shellcheck SC2162 - use read -r in while loops
sbryngelson Mar 8, 2026
f572dcf
bench: prefer rtx6000/l40s/v100 over h200/h100/a100 for GPU partition
sbryngelson Mar 9, 2026
8f298d1
ci: decouple SLURM submit from monitor for Phoenix jobs (Option 2)
sbryngelson Mar 9, 2026
0819b0e
Merge upstream/master: CCE 19.0.0 workaround, cache/build improvements
sbryngelson Mar 9, 2026
38df383
ci: fix --precision flag and remove Python 3.14 step in github job
sbryngelson Mar 9, 2026
07c4ab0
ci: fix fallback partition message, remove dead RETRY_CLEAN_CMD, fix …
sbryngelson Mar 9, 2026
1c81fc0
ci: submit-job.sh always submits fresh, cancels any stale SLURM job f…
sbryngelson Mar 9, 2026
0a39803
ci: fix heredoc pwd expansion, backtick substitution, combine bench l…
sbryngelson Mar 9, 2026
e686654
ci: remove redundant slurm_job_id write, improve bench log output
sbryngelson Mar 9, 2026
b97320b
ci: add explanatory comments, fix backtick in submit.sh
sbryngelson Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions .github/scripts/monitor_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,17 @@ cleanup() {
if [ -n "${tail_pid:-}" ]; then
kill "${tail_pid}" 2>/dev/null || true
fi
# Cancel the SLURM job if the monitor is exiting due to an error
# (e.g., the CI runner is being killed). Don't cancel on success.
# Cancel the SLURM job only if it is still active in the scheduler.
# If the job already left the queue (squeue returns empty), it has finished
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
if [ -n "$active_state" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
scancel "$job_id" 2>/dev/null || true
else
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
fi
fi
}
trap cleanup EXIT
Expand Down Expand Up @@ -56,9 +62,11 @@ get_job_state() {
}

# Check if a state is terminal (job is done, for better or worse)
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
# the same job ID and we must keep monitoring rather than exiting early.
is_terminal_state() {
case "$1" in
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
return 0 ;;
*)
return 1 ;;
Expand All @@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
state=$(get_job_state "$job_id")

case "$state" in
PENDING|CONFIGURING)
PENDING|CONFIGURING|PREEMPTED)
unknown_count=0
sleep 5
;;
Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ case "$cluster" in
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

rm -rf build

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

Expand Down
31 changes: 7 additions & 24 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -1,30 +1,13 @@
#!/bin/bash
# Provides retry_build(): 3-attempt loop with configurable cleanup.
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc

# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
local clean_cmd="$1"
if eval "$clean_cmd" 2>/dev/null; then
return 0
fi
echo " Normal cleanup failed."
if type _cache_nuke > /dev/null 2>&1; then
echo " Escalating to NFS cache nuke..."
_cache_nuke
else
echo " _cache_nuke not available, best-effort rm."
rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
fi
}

retry_build() {
local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
local max_attempts=3
local max_attempts=2
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
Expand All @@ -33,8 +16,8 @@ retry_build() {
if ! eval "$validate_cmd"; then
echo "Post-build validation failed on attempt $attempt."
if [ $attempt -lt $max_attempts ]; then
echo "Cleaning and retrying in 5s..."
_retry_clean "$clean_cmd"
echo " Nuking build directory before retry..."
rm -rf build 2>/dev/null || true
sleep 5
attempt=$((attempt + 1))
continue
Expand All @@ -48,8 +31,8 @@ retry_build() {
return 0
fi
if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Retrying in 30s..."
_retry_clean "$clean_cmd"
echo " Build failed — nuking build directory before retry..."
rm -rf build 2>/dev/null || true
sleep 30
else
echo "Build failed after $max_attempts attempts."
Expand Down
6 changes: 4 additions & 2 deletions .github/scripts/run_monitored_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ if [ "$monitor_exit" -ne 0 ]; then
echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
# Give the SLURM epilog time to finalize if the job just finished
sleep 30
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
final_state="${final_state:-UNKNOWN}"
final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || true)
final_exit="${final_exit:-}"
echo "Final SLURM state=$final_state exit=$final_exit"
if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
Expand Down
29 changes: 29 additions & 0 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,31 @@ echo "=========================================="
echo "Starting parallel benchmark jobs..."
echo "=========================================="

# For Phoenix GPU benchmarks, pick one GPU partition up front so the PR and
# master jobs launched below always land on the same GPU type.
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
  echo "Selecting Phoenix GPU partition for benchmark consistency..."
  # Candidate order: older/smaller partitions first (rtx6000, l40s, v100) so
  # the large modern nodes (h200, h100, a100) stay free for production work.
  # rtx6000 has the most nodes and gives the most consistent baselines.
  BENCH_GPU_PARTITION=""
  for candidate in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
    # grep -c exits non-zero when nothing matches (or when sinfo prints nothing
    # for an unknown partition); '|| true' keeps set -euo pipefail from aborting.
    avail=$(sinfo -p "$candidate" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
    if [ "${avail:-0}" -gt 0 ]; then
      BENCH_GPU_PARTITION="$candidate"
      echo "Selected GPU partition: $BENCH_GPU_PARTITION ($avail idle/mix nodes)"
      break
    fi
  done
  if [ -z "$BENCH_GPU_PARTITION" ]; then
    echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
    BENCH_GPU_PARTITION="gpu-rtx6000"
  fi
  export BENCH_GPU_PARTITION
fi

# Run both jobs with monitoring using dedicated script from PR
# Use stdbuf for line-buffered output and prefix each line for clarity
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
Expand All @@ -40,6 +65,8 @@ wait "$pr_pid"
pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
echo "PR job exited with code: $pr_exit"
echo "Last 50 lines of PR job log:"
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
else
echo "PR job completed successfully"
fi
Expand All @@ -48,6 +75,8 @@ wait "$master_pid"
master_exit=$?
if [ "$master_exit" -ne 0 ]; then
echo "Master job exited with code: $master_exit"
echo "Last 50 lines of master job log:"
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
else
echo "Master job completed successfully"
fi
Expand Down
101 changes: 0 additions & 101 deletions .github/scripts/setup-build-cache.sh

This file was deleted.

12 changes: 9 additions & 3 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ device="$2"
interface="$3"
cluster="$4"

# Absolute directory of this script, so sibling workflow scripts can be
# located no matter what the caller's working directory is.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Always use the PR's submit.sh so both master and PR builds benefit from the
# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is
# still resolved relative to the current directory (master/ or pr/) so the
# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs
# in the right directory regardless of which submit.sh is invoked.
PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
# Quote the cluster-derived path (SC2086) so an unexpected value cannot
# word-split into extra arguments.
bash "$PR_SUBMIT" ".github/workflows/${cluster}/bench.sh" "$device" "$interface"

# Verify the YAML output file was created
job_slug="bench-$device-$interface"
Expand Down
39 changes: 35 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ jobs:
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
Expand All @@ -106,7 +107,7 @@ jobs:
if: matrix.build_script != ''
uses: nick-fields/retry@v3
with:
max_attempts: 3
max_attempts: 2
retry_wait_seconds: 60
timeout_minutes: 150
command: |
Expand All @@ -118,13 +119,20 @@ jobs:
wait $pid2; e2=$?
[ $e1 -eq 0 ] && [ $e2 -eq 0 ]
on_retry_command: |
(cd pr && ./mfc.sh clean) &
(cd master && ./mfc.sh clean) &
wait
rm -rf pr/build master/build

- name: Bench (Master v. PR)
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
# Runs only when GitHub Actions cancels the workflow, so SLURM jobs the
# runner submitted do not keep running as orphans on the cluster.
if: cancelled()
run: |
# Each *.slurm_job_id file holds a submitted job's id — presumably written
# by submit.sh; verify against the submit script if this ever misses jobs.
find . -name "*.slurm_job_id" | while read -r f; do
job_id=$(cat "$f")
echo "Cancelling SLURM job $job_id"
# Best-effort: the job may already have finished or been cancelled, so
# never let scancel's failure fail this cleanup step.
scancel "$job_id" 2>/dev/null || true
done

- name: Generate & Post Comment
if: always()
run: |
Expand All @@ -137,6 +145,29 @@ jobs:
cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true

- name: Print Per-Case Logs
# Always runs so per-case results are visible even when the bench step fails.
if: always()
run: |
# Classify each benchmark case by output files: a case's .out log with a
# sibling .yaml result counts as passed; a .out without one counts as failed.
passed=() failed=()
for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
# An unmatched glob expands to itself; skip anything that is not a real file.
[ -f "$out" ] || continue
# Safe && ... || ... chain: 'passed+=' cannot fail, so '|| failed+=' only
# runs when the .yaml existence test itself fails.
[ -f "${out%.out}.yaml" ] && passed+=("$out") || failed+=("$out")
done

echo "=== Per-Case Summary: ${#failed[@]} failed, ${#passed[@]} passed ==="
# NOTE(review): "${arr[@]}" on an empty array errors under 'set -u' with
# bash < 4.4 — confirm the self-hosted runners' bash version.
for out in "${failed[@]}"; do echo " [FAILED] $out"; done
for out in "${passed[@]}"; do echo " [PASSED] $out"; done

# Dump full logs only for failing cases to keep CI output readable.
if [ ${#failed[@]} -gt 0 ]; then
echo ""
echo "=== Failed Case Logs ==="
for out in "${failed[@]}"; do
echo "--- $out ---"
cat "$out"
echo ""
done
fi

# All other runners (non-Phoenix) just run without special env
- name: Archive Logs (Frontier)
if: always() && matrix.cluster != 'phoenix'
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

source .github/scripts/bench-preamble.sh

# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
# Invoke nproc once and reuse the result instead of forking it twice.
cores=$(nproc)
n_jobs=$(( cores > 64 ? 64 : cores ))

if [ "$job_device" = "gpu" ]; then
    # GPU runs parallelize by MPI rank count, not host core count.
    # $device_opts is intentionally unquoted: it may carry multiple flags.
    ./mfc.sh bench --mem 4 -j "$n_ranks" -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n "$n_ranks"
else
    ./mfc.sh bench --mem 1 -j "$n_jobs" -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n "$n_ranks"
fi
Loading
Loading