Skip to content

Test Suite Flag --rdma-mpi Implemented (#598) #878

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/frontier/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`

if [ "$job_device" == "gpu" ]; then
./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
else
./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier
fi
1 change: 1 addition & 0 deletions docs/documentation/testing.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint
- `--percent` (`%`) to specify a percentage of the test suite to select at random and test
- `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed
- `--no-examples` skips the testing of cases in the examples folder
- `--rdma-mpi` runs additional tests where RDMA MPI is enabled

To specify a computer, pass the `-c` flag to `./mfc.sh run` like so:
```shell
Expand Down
17 changes: 9 additions & 8 deletions toolchain/mfc/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,15 @@ def add_common_arguments(p, mask = None):
test.add_argument("-l", "--list", action="store_true", help="List all available tests.")
test.add_argument("-f", "--from", default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.")
test.add_argument("-t", "--to", default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.")
test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.")
test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")
test.add_argument("-o", "--only", nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
test.add_argument("-a", "--test-all", action="store_true", default=False, help="Run the Post Process Tests too.")
test.add_argument("-%", "--percent", type=int, default=100, help="Percentage of tests to run.")
test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
test.add_argument( "--rdma-mpi", action="store_true", default=False, help="Run tests with RDMA MPI enabled")
test.add_argument( "--no-build", action="store_true", default=False, help="(Testing) Do not rebuild MFC.")
test.add_argument( "--no-examples", action="store_true", default=False, help="Do not test example cases." )
test.add_argument("--case-optimization", action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
test.add_argument( "--dry-run", action="store_true", default=False, help="Build and generate case files but do not run tests.")

test_meg = test.add_mutually_exclusive_group()
test_meg.add_argument("--generate", action="store_true", default=False, help="(Test Generation) Generate golden files.")
Expand Down
2 changes: 1 addition & 1 deletion toolchain/mfc/test/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
filepath = f'{self.get_dirpath()}/case.py'
tasks = ["-n", str(self.ppn)]
jobs = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else []
case_optimization = ["--case-optimization"] if ARG("case_optimization") else []
case_optimization = ["--case-optimization"] if ARG("case_optimization") else []

if self.params.get("bubbles_lagrange", 'F') == 'T':
input_bubbles_lagrange(self)
Expand Down
3 changes: 2 additions & 1 deletion toolchain/mfc/test/cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,9 +320,10 @@ def alter_3d():
# Appends the "2 MPI Ranks" test cases (ppn=2) for the current stack to `cases`,
# each paired with a "2 MPI Ranks -> RDMA MPI" case that also sets rdma_mpi to 'T'.
# NOTE(review): leading indentation is missing in this view, so the nesting of
# the final append (presumably inside the else branch) should be confirmed
# against the real file.
def alter_ppn(dimInfo):
# When dimInfo[0] has three entries, the cases override m, n and p explicitly.
if len(dimInfo[0]) == 3:
cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2))
# Same overrides plus rdma_mpi='T'; the "RDMA MPI" text in the title is
# presumably what ends up in case.trace for the test filter — confirm.
cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2))
else:
# Otherwise no parameters are overridden for the plain two-rank case.
cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2))

cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2))

def alter_ib(dimInfo, six_eqn_model=False):
for slip in [True, False]:
Expand Down
17 changes: 14 additions & 3 deletions toolchain/mfc/test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,19 @@ def __filter(cases_) -> typing.List[TestCase]:
if case.ppn > 1 and not ARG("mpi"):
cases.remove(case)
skipped_cases.append(case)


for case in cases[:]:
if "RDMA MPI" in case.trace and not ARG("rdma_mpi"):
cases.remove(case)
skipped_cases.append(case)

for case in cases[:]:
if ARG("single"):
skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6'
,'Axisymmetric', 'Transducer', 'Transducer Array', 'Cylindrical', 'HLLD', 'Example']
if any(label in case.trace for label in skip):
cases.remove(case)


if ARG("no_examples"):
cases = [case for case in cases if not "Example" in case.trace]

Expand Down Expand Up @@ -180,7 +184,11 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
cons.print(f" [bold magenta]{case.get_uuid()}[/bold magenta] SKIP {case.trace}")
return

cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
if "RDMA MPI" in case.trace:
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True)
else:
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
Comment on lines +187 to +190
Copy link
Preview

Copilot AI Jun 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

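[nitpick] Consider consolidating the RDMA branch with the standard `case.run` invocation into a single call that passes `rdma_mpi` conditionally, to reduce duplication. Note that either form requires `TestCase.run` to accept an `rdma_mpi` keyword argument; the signature shown in this diff (`run(self, targets, gpus)`) does not, so the call would raise a `TypeError` unless `toolchain/mfc/test/case.py` is updated as well.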

Suggested change
if "RDMA MPI" in case.trace:
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True)
else:
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
rdma_mpi = "RDMA MPI" in case.trace
cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=rdma_mpi)

Copilot uses AI. Check for mistakes.


out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")

common.file_write(out_filepath, cmd.stdout)
Expand Down Expand Up @@ -223,6 +231,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):

if ARG("test_all"):
case.delete_output()
# if ARG("rdma_mpi"):
# cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True)
# else:
cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
common.file_write(out_filepath, cmd.stdout)
Expand Down
Loading