Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
9163a11
Fix Frontier benchmark SLURM: use batch+1:59+normal QOS
Mar 6, 2026
ffe80ec
Fix bench.yml: restore timeout-minutes to 480 (revert accidental 240)
Mar 6, 2026
cfbc023
Remove persistent build cache for self-hosted test runners
sbryngelson Mar 6, 2026
5742030
Remove build cache from benchmark jobs on Phoenix and Frontier
sbryngelson Mar 6, 2026
7edb7c3
Fix submit.sh to survive monitor SIGKILL by re-checking SLURM state
sbryngelson Mar 6, 2026
773f5ad
Extract monitor SIGKILL recovery into shared run_monitored_slurm_job.sh
sbryngelson Mar 6, 2026
1311cbe
Reduce benchmark steps and switch Frontier bench to batch/normal QOS
sbryngelson Mar 5, 2026
644c9e4
Cap bench script parallelism at 64 to fix GNR node failures
sbryngelson Mar 3, 2026
a02f4b2
Disable AVX-512 FP16 to fix build on Granite Rapids nodes
sbryngelson Mar 3, 2026
ba91673
Fix Rich MarkupError crash when build output contains bracket paths
sbryngelson Mar 2, 2026
438627e
Merge branch 'master' into fix/ci-robustness
sbryngelson Mar 6, 2026
3e773ff
Address bot review comments: sacct -X flag, dead job_type var, stale …
Mar 6, 2026
fae2e6a
Fix bench: use PR's submit.sh for master job to get SIGKILL recovery
sbryngelson Mar 6, 2026
3224931
Fix submit_and_monitor_bench.sh: define SCRIPT_DIR before use
sbryngelson Mar 6, 2026
2887def
bench: update Phoenix tmpbuild path to project storage
sbryngelson Mar 7, 2026
1e4f984
Fix bench timeout (240→480) and monitor scancel defeating sacct recovery
sbryngelson Mar 7, 2026
5886f2a
Fix sacct empty-output edge case in run_monitored_slurm_job.sh
sbryngelson Mar 7, 2026
0551dea
bench: dynamic Phoenix GPU partition, per-case logs, downgrade grind …
sbryngelson Mar 8, 2026
16e0f76
bench: address code review findings in GPU partition selection
sbryngelson Mar 8, 2026
b396a1c
ci: add gpu-h200 partition to Phoenix test and case-optimization GPU …
sbryngelson Mar 8, 2026
7e5cabe
ci: scancel orphaned SLURM jobs when GitHub Actions cancels the runner
sbryngelson Mar 8, 2026
cf4f2a6
Fix Phoenix CPU test: restore build cache to isolate concurrent jobs
sbryngelson Mar 8, 2026
7abbce7
Revert "Fix Phoenix CPU test: restore build cache to isolate concurre…
sbryngelson Mar 8, 2026
df23011
Fix Phoenix test: pass explicit GPU flag to test command
sbryngelson Mar 8, 2026
8f586ae
ci: remove self-hosted runner build cache
sbryngelson Mar 8, 2026
24f25f3
ci: nuke entire build dir on attempt 3 of retry_build
sbryngelson Mar 8, 2026
0104233
ci: reduce to 2 attempts, nuke build dir on retry
sbryngelson Mar 8, 2026
ffb43f7
ci: revert case-opt to clean: false to preserve SLURM build cache
sbryngelson Mar 8, 2026
fb6101d
ci: treat PREEMPTED as non-terminal so --requeue jobs keep being moni…
sbryngelson Mar 8, 2026
68592d7
ci: clean build dir before case-opt pre-build; drop retry
sbryngelson Mar 8, 2026
0775fde
ci: remove dead RETRY_CLEAN_CMD from bench.sh
sbryngelson Mar 8, 2026
aa21620
ci: allow Frontier jobs to fail without blocking workflow
sbryngelson Mar 8, 2026
18311b8
ci: fix shellcheck SC2162 - use read -r in while loops
sbryngelson Mar 8, 2026
f572dcf
bench: prefer rtx6000/l40s/v100 over h200/h100/a100 for GPU partition
sbryngelson Mar 9, 2026
8f298d1
ci: decouple SLURM submit from monitor for Phoenix jobs (Option 2)
sbryngelson Mar 9, 2026
0819b0e
Merge upstream/master: CCE 19.0.0 workaround, cache/build improvements
sbryngelson Mar 9, 2026
38df383
ci: fix --precision flag and remove Python 3.14 step in github job
sbryngelson Mar 9, 2026
07c4ab0
ci: fix fallback partition message, remove dead RETRY_CLEAN_CMD, fix …
sbryngelson Mar 9, 2026
1c81fc0
ci: submit-job.sh always submits fresh, cancels any stale SLURM job f…
sbryngelson Mar 9, 2026
0a39803
ci: fix heredoc pwd expansion, backtick substitution, combine bench l…
sbryngelson Mar 9, 2026
e686654
ci: remove redundant slurm_job_id write, improve bench log output
sbryngelson Mar 9, 2026
b97320b
ci: add explanatory comments, fix backtick in submit.sh
sbryngelson Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions .github/scripts/monitor_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,17 @@ cleanup() {
if [ -n "${tail_pid:-}" ]; then
kill "${tail_pid}" 2>/dev/null || true
fi
# Cancel the SLURM job if the monitor is exiting due to an error
# (e.g., the CI runner is being killed). Don't cancel on success.
# Cancel the SLURM job only if it is still active in the scheduler.
# If the job already left the queue (squeue returns empty), it has finished
# and run_monitored_slurm_job.sh will recover via sacct — don't cancel it.
if [ "${monitor_success:-0}" -ne 1 ] && [ -n "${job_id:-}" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id"
scancel "$job_id" 2>/dev/null || true
active_state=$(squeue -j "$job_id" -h -o '%T' 2>/dev/null | head -n1 | tr -d ' ' || echo "")
if [ -n "$active_state" ]; then
echo "Monitor exiting abnormally — cancelling SLURM job $job_id (state: $active_state)"
scancel "$job_id" 2>/dev/null || true
else
echo "Monitor exiting abnormally — SLURM job $job_id already left queue, not cancelling"
fi
fi
}
trap cleanup EXIT
Expand Down Expand Up @@ -56,9 +62,11 @@ get_job_state() {
}

# Check if a state is terminal (job is done, for better or worse)
# PREEMPTED is intentionally excluded: with --requeue the job restarts under
# the same job ID and we must keep monitoring rather than exiting early.
is_terminal_state() {
case "$1" in
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|PREEMPTED|REVOKED)
COMPLETED|FAILED|CANCELLED|CANCELLED+|TIMEOUT|OUT_OF_MEMORY|NODE_FAIL|BOOT_FAIL|DEADLINE|REVOKED)
return 0 ;;
*)
return 1 ;;
Expand All @@ -74,7 +82,7 @@ while [ ! -f "$output_file" ]; do
state=$(get_job_state "$job_id")

case "$state" in
PENDING|CONFIGURING)
PENDING|CONFIGURING|PREEMPTED)
unknown_count=0
sleep 5
;;
Expand Down
2 changes: 2 additions & 0 deletions .github/scripts/prebuild-case-optimization.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ case "$cluster" in
*) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
esac

rm -rf build

. ./mfc.sh load -c "$flag" -m g
source .github/scripts/gpu-opts.sh

Expand Down
31 changes: 7 additions & 24 deletions .github/scripts/retry-build.sh
Original file line number Diff line number Diff line change
@@ -1,30 +1,13 @@
#!/bin/bash
# Provides retry_build(): 3-attempt loop with configurable cleanup.
# Set RETRY_CLEAN_CMD to override cleanup (default: rm -rf build/staging build/install build/lock.yaml).
# Provides retry_build(): 2-attempt loop.
# On failure of attempt 1, nukes the entire build directory before attempt 2.
# Set RETRY_VALIDATE_CMD to run a post-build validation; failure triggers a retry.
# Usage: source .github/scripts/retry-build.sh
# retry_build ./mfc.sh build -j 8 --gpu acc

# Try normal cleanup; if it fails, escalate to cache nuke.
_retry_clean() {
local clean_cmd="$1"
if eval "$clean_cmd" 2>/dev/null; then
return 0
fi
echo " Normal cleanup failed."
if type _cache_nuke > /dev/null 2>&1; then
echo " Escalating to NFS cache nuke..."
_cache_nuke
else
echo " _cache_nuke not available, best-effort rm."
rm -rf build/staging build/install build/lock.yaml 2>/dev/null || true
fi
}

retry_build() {
local clean_cmd="${RETRY_CLEAN_CMD:-rm -rf build/staging build/install build/lock.yaml}"
local validate_cmd="${RETRY_VALIDATE_CMD:-}"
local max_attempts=3
local max_attempts=2
local attempt=1
while [ $attempt -le $max_attempts ]; do
echo "Build attempt $attempt of $max_attempts..."
Expand All @@ -33,8 +16,8 @@ retry_build() {
if ! eval "$validate_cmd"; then
echo "Post-build validation failed on attempt $attempt."
if [ $attempt -lt $max_attempts ]; then
echo "Cleaning and retrying in 5s..."
_retry_clean "$clean_cmd"
echo " Nuking build directory before retry..."
rm -rf build 2>/dev/null || true
sleep 5
attempt=$((attempt + 1))
continue
Expand All @@ -48,8 +31,8 @@ retry_build() {
return 0
fi
if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Retrying in 30s..."
_retry_clean "$clean_cmd"
echo " Build failed — nuking build directory before retry..."
rm -rf build 2>/dev/null || true
sleep 30
else
echo "Build failed after $max_attempts attempts."
Expand Down
6 changes: 4 additions & 2 deletions .github/scripts/run_monitored_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ if [ "$monitor_exit" -ne 0 ]; then
echo "Monitor exited with code $monitor_exit; re-checking SLURM job $job_id final state..."
# Give the SLURM epilog time to finalize if the job just finished
sleep 30
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || echo "UNKNOWN")
final_exit=$(sacct -j "$job_id" --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || echo "")
final_state=$(sacct -j "$job_id" -n -X -P -o State 2>/dev/null | head -n1 | cut -d'|' -f1 | tr -d ' ' || true)
final_state="${final_state:-UNKNOWN}"
final_exit=$(sacct -j "$job_id" -X --format=ExitCode --noheader --parsable2 2>/dev/null | head -n1 | tr -d ' ' || true)
final_exit="${final_exit:-}"
echo "Final SLURM state=$final_state exit=$final_exit"
if [ "$final_state" = "COMPLETED" ] && [ "$final_exit" = "0:0" ]; then
echo "SLURM job $job_id completed successfully despite monitor failure — continuing."
Expand Down
29 changes: 29 additions & 0 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,31 @@ echo "=========================================="
echo "Starting parallel benchmark jobs..."
echo "=========================================="

# For Phoenix GPU benchmarks, pick one GPU partition up front so the PR and
# master jobs launched below always land on the same GPU type.
if [ "$device" = "gpu" ] && [ "$cluster" = "phoenix" ]; then
  echo "Selecting Phoenix GPU partition for benchmark consistency..."
  # Candidate order: older/smaller partitions first (rtx6000, l40s, v100) so
  # the large modern nodes (h200, h100, a100) stay free for production work.
  # rtx6000 has the most nodes and gives the most consistent baselines.
  BENCH_GPU_PARTITION=""
  for candidate in gpu-rtx6000 gpu-l40s gpu-v100 gpu-h200 gpu-h100 gpu-a100; do
    # grep -c exits non-zero when nothing matches (or when sinfo prints nothing
    # for an unknown partition); '|| true' keeps set -euo pipefail from aborting.
    avail=$(sinfo -p "$candidate" --noheader -o "%t" 2>/dev/null | grep -cE "^(idle|mix)" || true)
    if [ "${avail:-0}" -gt 0 ]; then
      BENCH_GPU_PARTITION="$candidate"
      echo "Selected GPU partition: $BENCH_GPU_PARTITION ($avail idle/mix nodes)"
      break
    fi
  done
  if [ -z "$BENCH_GPU_PARTITION" ]; then
    echo "WARNING: No idle GPU partition found; falling back to gpu-rtx6000 (may queue)"
    BENCH_GPU_PARTITION="gpu-rtx6000"
  fi
  export BENCH_GPU_PARTITION
fi

# Run both jobs with monitoring using dedicated script from PR
# Use stdbuf for line-buffered output and prefix each line for clarity
(set -o pipefail; stdbuf -oL -eL bash "${SCRIPT_DIR}/submit_and_monitor_bench.sh" pr "$device" "$interface" "$cluster" 2>&1 | while IFS= read -r line; do echo "[PR] $line"; done) &
Expand All @@ -40,6 +65,8 @@ wait "$pr_pid"
pr_exit=$?
if [ "$pr_exit" -ne 0 ]; then
echo "PR job exited with code: $pr_exit"
echo "Last 50 lines of PR job log:"
tail -n 50 "pr/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read PR log"
else
echo "PR job completed successfully"
fi
Expand All @@ -48,6 +75,8 @@ wait "$master_pid"
master_exit=$?
if [ "$master_exit" -ne 0 ]; then
echo "Master job exited with code: $master_exit"
echo "Last 50 lines of master job log:"
tail -n 50 "master/bench-${device}-${interface}.out" 2>/dev/null || echo " Could not read master log"
else
echo "Master job completed successfully"
fi
Expand Down
101 changes: 0 additions & 101 deletions .github/scripts/setup-build-cache.sh

This file was deleted.

12 changes: 9 additions & 3 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,18 @@ device="$2"
interface="$3"
cluster="$4"

# Absolute directory of this script, so sibling workflow scripts can be
# located no matter what the caller's working directory is.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "[$dir] Submitting benchmark for $device-$interface on $cluster..."
cd "$dir"

# Always use the PR's submit.sh so both master and PR builds benefit from the
# run_monitored_slurm_job.sh SIGKILL recovery wrapper. The bench script is
# still resolved relative to the current directory (master/ or pr/) so the
# correct branch code is benchmarked. SLURM_SUBMIT_DIR ensures the job runs
# in the right directory regardless of which submit.sh is invoked.
PR_SUBMIT="${SCRIPT_DIR}/../workflows/${cluster}/submit.sh"
# Quote the cluster-derived path (SC2086) so an unexpected value cannot
# word-split into extra arguments.
bash "$PR_SUBMIT" ".github/workflows/${cluster}/bench.sh" "$device" "$interface"

# Verify the YAML output file was created
job_slug="bench-$device-$interface"
Expand Down
39 changes: 35 additions & 4 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ jobs:
device: gpu
interface: omp
build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }}
runs-on:
group: ${{ matrix.group }}
labels: ${{ matrix.labels }}
Expand All @@ -106,7 +107,7 @@ jobs:
if: matrix.build_script != ''
uses: nick-fields/retry@v3
with:
max_attempts: 3
max_attempts: 2
retry_wait_seconds: 60
timeout_minutes: 150
command: |
Expand All @@ -118,13 +119,20 @@ jobs:
wait $pid2; e2=$?
[ $e1 -eq 0 ] && [ $e2 -eq 0 ]
on_retry_command: |
(cd pr && ./mfc.sh clean) &
(cd master && ./mfc.sh clean) &
wait
rm -rf pr/build master/build

- name: Bench (Master v. PR)
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Cancel SLURM Jobs
# Runs only when GitHub Actions cancels the workflow, so SLURM jobs the
# runner submitted do not keep running as orphans on the cluster.
if: cancelled()
run: |
# Each *.slurm_job_id file holds a submitted job's id — presumably written
# by submit.sh; verify against the submit script if this ever misses jobs.
find . -name "*.slurm_job_id" | while read -r f; do
job_id=$(cat "$f")
echo "Cancelling SLURM job $job_id"
# Best-effort: the job may already have finished or been cancelled, so
# never let scancel's failure fail this cleanup step.
scancel "$job_id" 2>/dev/null || true
done

- name: Generate & Post Comment
if: always()
run: |
Expand All @@ -137,6 +145,29 @@ jobs:
cat pr/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true
cat master/bench-${{ matrix.device }}-${{ matrix.interface }}.* 2>/dev/null || true

- name: Print Per-Case Logs
# Always runs so per-case results are visible even when the bench step fails.
if: always()
run: |
# Classify each benchmark case by output files: a case's .out log with a
# sibling .yaml result counts as passed; a .out without one counts as failed.
passed=() failed=()
for out in pr/build/benchmarks/*/*.out master/build/benchmarks/*/*.out; do
# An unmatched glob expands to itself; skip anything that is not a real file.
[ -f "$out" ] || continue
# Safe && ... || ... chain: 'passed+=' cannot fail, so '|| failed+=' only
# runs when the .yaml existence test itself fails.
[ -f "${out%.out}.yaml" ] && passed+=("$out") || failed+=("$out")
done

echo "=== Per-Case Summary: ${#failed[@]} failed, ${#passed[@]} passed ==="
# NOTE(review): "${arr[@]}" on an empty array errors under 'set -u' with
# bash < 4.4 — confirm the self-hosted runners' bash version.
for out in "${failed[@]}"; do echo " [FAILED] $out"; done
for out in "${passed[@]}"; do echo " [PASSED] $out"; done

# Dump full logs only for failing cases to keep CI output readable.
if [ ${#failed[@]} -gt 0 ]; then
echo ""
echo "=== Failed Case Logs ==="
for out in "${failed[@]}"; do
echo "--- $out ---"
cat "$out"
echo ""
done
fi

# All other runners (non-Phoenix) just run without special env
- name: Archive Logs (Frontier)
if: always() && matrix.cluster != 'phoenix'
Expand Down
5 changes: 4 additions & 1 deletion .github/workflows/frontier/bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

source .github/scripts/bench-preamble.sh

# Cap parallel jobs at 64 to avoid overwhelming MPI daemons on large nodes.
# Invoke nproc once and reuse the result instead of forking it twice.
cores=$(nproc)
n_jobs=$(( cores > 64 ? 64 : cores ))

if [ "$job_device" = "gpu" ]; then
    # GPU runs parallelize by MPI rank count, not host core count.
    # $device_opts is intentionally unquoted: it may carry multiple flags.
    ./mfc.sh bench --mem 4 -j "$n_ranks" -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n "$n_ranks"
else
    ./mfc.sh bench --mem 1 -j "$n_jobs" -o "$job_slug.yaml" -- -c "$job_cluster" $device_opts -n "$n_ranks"
fi
Loading
Loading