From a22f2061c7b7841172ac5724c21fd8e65563cc7d Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Mar 2026 10:47:26 -0400 Subject: [PATCH 1/5] Add --deps-only flag to separate dependency fetching from source builds This allows CI to fetch and build dependencies (FFTW, HDF5, etc.) on login nodes with internet access, then build MFC source code on compute nodes that may have no network connectivity. Key changes: - New --deps-only CLI flag for ./mfc.sh build - Already-configured dependencies are skipped entirely during regular builds, guaranteeing no network access in the source build step - Frontier and Frontier AMD now follow the pattern: deps on login node, source build + test on compute node --- .github/workflows/bench.yml | 8 ++++---- .github/workflows/common/bench.sh | 12 ++++++------ .github/workflows/common/test.sh | 30 ++++++++++++++--------------- .github/workflows/frontier/build.sh | 7 +------ .github/workflows/test.yml | 4 ++-- toolchain/mfc/build.py | 26 +++++++++++++++++++++++++ toolchain/mfc/cli/commands.py | 7 +++++++ 7 files changed, 61 insertions(+), 33 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 7ce02c1e3f..d39831730d 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -68,7 +68,7 @@ jobs: flag: f device: gpu interface: acc - build_script: "bash .github/workflows/frontier/build.sh gpu acc bench" + build_script: "bash .github/workflows/frontier/build.sh gpu acc" - cluster: frontier name: Oak Ridge | Frontier (CCE) group: phoenix @@ -76,7 +76,7 @@ jobs: flag: f device: gpu interface: omp - build_script: "bash .github/workflows/frontier/build.sh gpu omp bench" + build_script: "bash .github/workflows/frontier/build.sh gpu omp" - cluster: frontier_amd name: Oak Ridge | Frontier (AMD) group: phoenix @@ -84,7 +84,7 @@ jobs: flag: famd device: gpu interface: omp - build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench" + build_script: "bash 
.github/workflows/frontier_amd/build.sh gpu omp" continue-on-error: ${{ matrix.cluster == 'frontier' || matrix.cluster == 'frontier_amd' }} runs-on: group: ${{ matrix.group }} @@ -103,7 +103,7 @@ jobs: ref: master path: master - - name: Setup & Build + - name: Fetch Dependencies if: matrix.build_script != '' timeout-minutes: 150 run: | diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 66d77cfd99..be83e57b87 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -21,18 +21,18 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then source .github/scripts/clean-build.sh clean_build fi -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh - retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 -fi +source .github/scripts/retry-build.sh +retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 # --- Bench cluster flag --- if [ "$job_cluster" = "phoenix" ]; then diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index e155fd48f8..9eb116e183 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -21,29 +21,29 @@ if [ "$job_cluster" = "phoenix" ]; then trap 'rm -rf "$currentdir" || true' EXIT fi -# --- Build (if not pre-built on login node) --- -# Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. 
-# Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh -# to avoid SIGILL from stale binaries compiled on a different microarchitecture. +# --- Build --- +# Phoenix builds everything inside SLURM (no login-node build step). +# Frontier/Frontier AMD: deps already fetched on login node via --deps-only; +# source code is built here on the compute node. +# Phoenix: always start fresh to avoid SIGILL from stale binaries compiled +# on a different microarchitecture. if [ "$job_cluster" = "phoenix" ]; then source .github/scripts/clean-build.sh clean_build fi -if [ ! -d "build" ]; then - source .github/scripts/retry-build.sh +source .github/scripts/retry-build.sh - # Phoenix: smoke-test the syscheck binary to catch architecture mismatches - # (SIGILL from binaries compiled on a different compute node). - validate_cmd="" - if [ "$job_cluster" = "phoenix" ]; then - validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' - fi - - RETRY_VALIDATE_CMD="$validate_cmd" \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 +# Phoenix: smoke-test the syscheck binary to catch architecture mismatches +# (SIGILL from binaries compiled on a different compute node). 
+validate_cmd="" +if [ "$job_cluster" = "phoenix" ]; then + validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' fi +RETRY_VALIDATE_CMD="$validate_cmd" \ + retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 + # --- GPU detection and thread count --- device_opts="" rdma_opts="" diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 5bd40999d7..cd289ef074 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -14,7 +14,6 @@ esac job_device=$1 job_interface=$2 -run_bench=$3 source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" @@ -24,8 +23,4 @@ source .github/scripts/clean-build.sh clean_build source .github/scripts/retry-build.sh -if [ "$run_bench" == "bench" ]; then - retry_build ./mfc.sh build -j 8 $build_opts || exit 1 -else - retry_build ./mfc.sh test -v -a --dry-run $([ "$cluster_name" = "frontier" ] && echo "--rdma-mpi") -j 8 $build_opts || exit 1 -fi +retry_build ./mfc.sh build --deps-only -j 8 $build_opts || exit 1 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 90ad965c52..c13e5c22e3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -400,12 +400,12 @@ jobs: echo "Coverage cache: none available — full test suite will run" fi - - name: Build (login node) + - name: Fetch Dependencies if: matrix.cluster != 'phoenix' timeout-minutes: 60 run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} - - name: Test + - name: Build & Test run: bash .github/scripts/submit-slurm-job.sh .github/workflows/common/test.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} ${{ matrix.shard }} - name: Cancel SLURM Jobs diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index d6daf97bb6..6173fbfb0e 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py 
@@ -552,6 +552,12 @@ def __build_target(target: typing.Union[MFCTarget, str], case: input.MFCInputFil history.add(target.name) + # Dependencies are pinned to fixed versions. If already configured (presumably + # built & installed by a prior --deps-only step -- TODO confirm), skip entirely + # to avoid re-entering the superbuild (which may access the network). + if target.isDependency and target.is_configured(case): + return + for dep in target.requires.compute(): # If we have already built and installed this target, # do not do so again. This can be inferred by whether @@ -594,6 +600,26 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str case = case or input.load(ARG("input"), ARG("--"), {}) case.validate_params() + if ARG("deps_only", False): + all_deps = set() + for t in targets: + resolved = get_target(t) + for dep in resolved.requires.compute(): + all_deps.add(dep) + + if len(history) == 0: + cons.print(f"[bold]Fetch Dependencies | {format_list_to_string(sorted(d.name for d in all_deps), 'magenta', 'None')}[/bold]") + cons.print(no_indent=True) + + if not all_deps: + cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]") + return + + for dep in sorted(all_deps, key=lambda d: d.name): + __build_target(dep, case, history) + + return + if len(history) == 0: cons.print(__generate_header(case, targets)) cons.print(no_indent=True) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 85aab95031..e98003aa74 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -134,6 +134,13 @@ default=False, dest="case_optimization", ), + Argument( + name="deps-only", + help="Only fetch and build dependencies, do not build MFC targets.", + action=ArgAction.STORE_TRUE, + default=False, + dest="deps_only", + ), ], examples=[ Example("./mfc.sh build", "Build all default targets (CPU)"), From 8d39c7b982187b00305bd1996455d2abc83b45f0 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Mar 2026 13:50:45 -0400 Subject: [PATCH 
2/5] Fix --deps-only recursive build: only gate top-level call When __build_target recursively called build() to resolve sub-deps (e.g. SILO depends on HDF5), the --deps-only guard intercepted the recursive call and only built deps-of-deps (none for HDF5), never building HDF5 itself. This caused SILO's configure to fail with 'HDF5 was not found'. Fix: only enter the --deps-only path on the top-level call (history is empty), letting recursive calls proceed normally. --- toolchain/mfc/build.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 6173fbfb0e..9fed43c271 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -600,16 +600,15 @@ def build(targets=None, case: input.MFCInputFile = None, history: typing.Set[str case = case or input.load(ARG("input"), ARG("--"), {}) case.validate_params() - if ARG("deps_only", False): + if ARG("deps_only", False) and len(history) == 0: all_deps = set() for t in targets: resolved = get_target(t) for dep in resolved.requires.compute(): all_deps.add(dep) - if len(history) == 0: - cons.print(f"[bold]Fetch Dependencies | {format_list_to_string(sorted(d.name for d in all_deps), 'magenta', 'None')}[/bold]") - cons.print(no_indent=True) + cons.print(f"[bold]Fetch Dependencies | {format_list_to_string(sorted(d.name for d in all_deps), 'magenta', 'None')}[/bold]") + cons.print(no_indent=True) if not all_deps: cons.print("[yellow]No dependencies to build for the requested targets.[/yellow]") From 7317ad54a65048c24f86f54c0082940b6e2e202f Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Mar 2026 15:10:19 -0400 Subject: [PATCH 3/5] ci: move case-optimization source builds to compute nodes for Frontier Frontier/Frontier AMD now follow the same deps-on-login, source-on-compute pattern for case optimization tests. Previously, prebuild-case-optimization.sh built deps + source on the login node. 
Now: - Login node: build.sh fetches deps via --deps-only - Compute node: run_case_optimization.sh builds case-optimized binaries then runs them Phoenix is unchanged (prebuild + run both in SLURM). --- .github/scripts/run_case_optimization.sh | 19 ++++++++++++------- .github/workflows/test.yml | 9 +++++++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index 21b6ff0b6f..edc01dd6c9 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -13,13 +13,6 @@ if [ "$job_device" = "gpu" ] && [ "$ngpus" -eq 0 ]; then ngpus=1 fi -# Verify the venv Python interpreter exists (created by ./mfc.sh build) -if [ ! -x build/venv/bin/python3 ]; then - echo "ERROR: build/venv/bin/python3 not found." - echo "The MFC build venv may not have been created. Was the pre-build step successful?" - exit 1 -fi - benchmarks=( benchmarks/5eq_rk3_weno3_hllc/case.py benchmarks/viscous_weno5_sgb_acoustic/case.py @@ -28,6 +21,18 @@ benchmarks=( benchmarks/igr/case.py ) +# For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only; +# build case-optimized binaries here on the compute node before running. +# For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job. 
+if [ "$job_cluster" != "phoenix" ]; then + echo "=== Building case-optimized binaries on compute node ===" + for case in "${benchmarks[@]}"; do + echo "--- Building: $case ---" + ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8 || exit 1 + done + echo "=== All case-optimized binaries built ===" +fi + passed=0 failed=0 failed_cases="" diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c13e5c22e3..d40a44f04f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -486,15 +486,20 @@ jobs: - name: Clean stale output files run: rm -f *.out + - name: Fetch Dependencies + if: matrix.cluster != 'phoenix' + run: bash .github/workflows/${{ matrix.cluster }}/build.sh ${{ matrix.device }} ${{ matrix.interface }} + - name: Pre-Build (SLURM) if: matrix.cluster == 'phoenix' run: bash .github/scripts/submit-slurm-job.sh .github/scripts/prebuild-case-optimization.sh cpu ${{ matrix.interface }} ${{ matrix.cluster }} - - name: Pre-Build (login node) + - name: Build & Run Case-Optimization Tests if: matrix.cluster != 'phoenix' - run: bash .github/scripts/prebuild-case-optimization.sh ${{ matrix.cluster }} ${{ matrix.device }} ${{ matrix.interface }} + run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Run Case-Optimization Tests + if: matrix.cluster == 'phoenix' run: bash .github/scripts/submit-slurm-job.sh .github/scripts/run_case_optimization.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }} - name: Cancel SLURM Jobs From 750b1fd8d81dbebf212bdb7acb7ebeb18e0c37a5 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Wed, 25 Mar 2026 18:52:55 -0400 Subject: [PATCH 4/5] ci: use -j 1 for Frontier Cray builds to work around CCE 19.0.0 IPA SIGSEGV --- .github/scripts/run_case_optimization.sh | 8 +++++++- .github/workflows/common/bench.sh | 5 +++++ .github/workflows/common/test.sh | 8 +++++++- 3 files changed, 19 
insertions(+), 2 deletions(-) diff --git a/.github/scripts/run_case_optimization.sh b/.github/scripts/run_case_optimization.sh index edc01dd6c9..bd54aedc05 100755 --- a/.github/scripts/run_case_optimization.sh +++ b/.github/scripts/run_case_optimization.sh @@ -24,11 +24,17 @@ benchmarks=( # For Frontier/Frontier AMD: deps were fetched on the login node via --deps-only; # build case-optimized binaries here on the compute node before running. # For Phoenix: prebuild-case-optimization.sh already built everything in a prior SLURM job. +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +build_jobs=8 +if [ "$job_cluster" = "frontier" ]; then + build_jobs=1 +fi + if [ "$job_cluster" != "phoenix" ]; then echo "=== Building case-optimized binaries on compute node ===" for case in "${benchmarks[@]}"; do echo "--- Building: $case ---" - ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j 8 || exit 1 + ./mfc.sh build -i "$case" --case-optimization $gpu_opts -j $build_jobs || exit 1 done echo "=== All case-optimized binaries built ===" fi diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index be83e57b87..9522e3a043 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -31,6 +31,11 @@ if [ "$job_cluster" = "phoenix" ]; then clean_build fi +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA SIGSEGV +if [ "$job_cluster" = "frontier" ]; then + n_jobs=1 +fi + source .github/scripts/retry-build.sh retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 9eb116e183..2733235549 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -41,8 +41,14 @@ if [ "$job_cluster" = "phoenix" ]; then validate_cmd='syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1); [ -z "$syscheck_bin" ] || "$syscheck_bin" > /dev/null 2>&1' fi +# Frontier Cray: -j 1 to work around CCE 19.0.0 IPA 
SIGSEGV +build_jobs=8 +if [ "$job_cluster" = "frontier" ]; then + build_jobs=1 +fi + RETRY_VALIDATE_CMD="$validate_cmd" \ - retry_build ./mfc.sh test -v --dry-run -j 8 $build_opts || exit 1 + retry_build ./mfc.sh test -v --dry-run -j $build_jobs $build_opts || exit 1 # --- GPU detection and thread count --- device_opts="" From 3adf7dc89f11d6b99242eb6fbb305d48811280ca Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 26 Mar 2026 10:30:46 -0400 Subject: [PATCH 5/5] ci: suppress Cray warnings 990 and 7208 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb77271a37..04c51eafc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,7 +176,7 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") endif() elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray") add_compile_options( - "SHELL:-M 296,878,1391,1069,5025" + "SHELL:-M 296,878,1391,1069,990,5025,7208" "SHELL:-h static" "SHELL:-h keepfiles" "SHELL:-h acc_model=auto_async_none" "SHELL: -h acc_model=no_fast_addr"