MFlowCode · Malmahrouqi3 · Jun 11, 2025 · Jun 11, 2025 · Jun 11, 2025 · Jun 11, 2025
@@ -4,7 +4,7 @@ gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n'
 ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
 
 if [ "$job_device" == "gpu" ]; then
-    ./mfc.sh test --max-attempts 3 -j $ngpus -- -c frontier
+    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j $ngpus -- -c frontier
 else
-    ./mfc.sh test --max-attempts 3 -j 32 -- -c frontier
+    ./mfc.sh test -a --rdma-mpi --max-attempts 3 -j 32 -- -c frontier
 fi
@@ -16,6 +16,7 @@ A test is considered passing when our error tolerances are met in order to maint
 - `--percent` (`%`) to specify a percentage of the test suite to select at random and test
 - `--max-attempts` (`-m`) the maximum number of attempts to make on a test before considering it failed
 - `--no-examples` skips the testing of cases in the examples folder
+- `--rdma-mpi` additionally runs the test cases that have RDMA MPI enabled (these are skipped unless the flag is passed)
 
 To specify a computer, pass the `-c` flag to `./mfc.sh run` like so:
 ```shell

@@ -76,14 +76,15 @@ def add_common_arguments(p, mask = None):
     test.add_argument("-l", "--list",         action="store_true", help="List all available tests.")
     test.add_argument("-f", "--from",         default=test_cases[0].get_uuid(), type=str, help="First test UUID to run.")
     test.add_argument("-t", "--to",           default=test_cases[-1].get_uuid(), type=str, help="Last test UUID to run.")
-    test.add_argument("-o", "--only",         nargs="+", type=str, default=[], metavar="L", help="Only run tests with specified properties.")
-    test.add_argument("-a", "--test-all",     action="store_true", default=False, help="Run the Post Process Tests too.")
-    test.add_argument("-%", "--percent",      type=int, default=100, help="Percentage of tests to run.")
-    test.add_argument("-m", "--max-attempts", type=int, default=1, help="Maximum number of attempts to run a test.")
-    test.add_argument(      "--no-build",     action="store_true",                    default=False,      help="(Testing) Do not rebuild MFC.")
-    test.add_argument(      "--no-examples",  action="store_true",                    default=False,      help="Do not test example cases." )
-    test.add_argument("--case-optimization",  action="store_true", default=False, help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
-    test.add_argument(      "--dry-run",      action="store_true",                    default=False,      help="Build and generate case files but do not run tests.")
+    test.add_argument("-o", "--only",         nargs="+", type=str,     default=[], metavar="L", help="Only run tests with specified properties.")
+    test.add_argument("-a", "--test-all",     action="store_true",     default=False,     help="Run the Post Process Tests too.")
+    test.add_argument("-%", "--percent",      type=int,                default=100,       help="Percentage of tests to run.")
+    test.add_argument("-m", "--max-attempts", type=int,                default=1,         help="Maximum number of attempts to run a test.")
+    test.add_argument(      "--rdma-mpi",     action="store_true",     default=False,     help="Run tests with RDMA MPI enabled")
+    test.add_argument(      "--no-build",     action="store_true",     default=False,     help="(Testing) Do not rebuild MFC.")
+    test.add_argument(      "--no-examples",  action="store_true",     default=False,     help="Do not test example cases." )
+    test.add_argument("--case-optimization",  action="store_true",     default=False,     help="(GPU Optimization) Compile MFC targets with some case parameters hard-coded.")
+    test.add_argument(      "--dry-run",      action="store_true",     default=False,     help="Build and generate case files but do not run tests.")
 
     test_meg = test.add_mutually_exclusive_group()
     test_meg.add_argument("--generate",          action="store_true", default=False, help="(Test Generation) Generate golden files.")

@@ -133,7 +133,7 @@ def run(self, targets: List[Union[str, MFCTarget]], gpus: Set[int]) -> subproces
         filepath          = f'{self.get_dirpath()}/case.py'
         tasks             = ["-n", str(self.ppn)]
         jobs              = ["-j", str(ARG("jobs"))] if ARG("case_optimization") else []
-        case_optimization = ["--case-optimization"] if ARG("case_optimization") else []
+        case_optimization = ["--case-optimization"]  if ARG("case_optimization") else []
 
         if self.params.get("bubbles_lagrange", 'F') == 'T':
             input_bubbles_lagrange(self)

@@ -320,9 +320,10 @@ def alter_3d():
     def alter_ppn(dimInfo):
         if len(dimInfo[0]) == 3:
             cases.append(define_case_d(stack, '2 MPI Ranks', {'m': 29, 'n': 29, 'p': 49}, ppn=2))
+            cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'m': 29, 'n': 29, 'p': 49, 'rdma_mpi': 'T'}, ppn=2))
         else:
             cases.append(define_case_d(stack, '2 MPI Ranks', {}, ppn=2))
-
+            cases.append(define_case_d(stack, '2 MPI Ranks -> RDMA MPI', {'rdma_mpi': 'T'}, ppn=2))
 
     def alter_ib(dimInfo, six_eqn_model=False):
         for slip in [True, False]:

@@ -56,15 +56,19 @@ def __filter(cases_) -> typing.List[TestCase]:
         if case.ppn > 1 and not ARG("mpi"):
             cases.remove(case)
             skipped_cases.append(case)
-
+
+    for case in cases[:]:
+        if "RDMA MPI" in case.trace and not ARG("rdma_mpi"):
+            cases.remove(case)
+            skipped_cases.append(case)
+
     for case in cases[:]:
         if ARG("single"):
             skip = ['low_Mach', 'Hypoelasticity', 'teno', 'Chemistry', 'Phase Change model 6'
             ,'Axisymmetric', 'Transducer', 'Transducer Array', 'Cylindrical', 'HLLD', 'Example']
             if any(label in case.trace for label in skip):
                 cases.remove(case)
 
-
     if ARG("no_examples"):
         cases = [case for case in cases if not "Example" in case.trace]
 
@@ -180,7 +184,11 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
         cons.print(f"  [bold magenta]{case.get_uuid()}[/bold magenta]     SKIP     {case.trace}")
         return
 
-    cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
+    if "RDMA MPI" in case.trace:
+        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True)
+    else:
+        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
-    if "RDMA MPI" in case.trace:
-        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True)
-    else:
-        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
+    rdma_mpi = "RDMA MPI" in case.trace
+    cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=rdma_mpi)
-    if "RDMA MPI" in case.trace:
-        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=True)
-    else:
-        cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices)
+    rdma_mpi = "RDMA MPI" in case.trace
+    cmd = case.run([PRE_PROCESS, SIMULATION], gpus=devices, rdma_mpi=rdma_mpi)
+
     out_filepath = os.path.join(case.get_dirpath(), "out_pre_sim.txt")
 
     common.file_write(out_filepath, cmd.stdout)
@@ -223,6 +231,9 @@ def _handle_case(case: TestCase, devices: typing.Set[int]):
 
     if ARG("test_all"):
         case.delete_output()
+        # if ARG("rdma_mpi"):
+        #     cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices, rdma_mpi=True)
+        # else:
         cmd = case.run([PRE_PROCESS, SIMULATION, POST_PROCESS], gpus=devices)
         out_filepath = os.path.join(case.get_dirpath(), "out_post.txt")
         common.file_write(out_filepath, cmd.stdout)