"""Plot pingpong timings from the nightly udp-cffi / udp-cython benchmark logs.

Reconstructed from the patch hunk for ``.tmp/compare_pingpong.py``.
"""
from os import listdir

import numpy as np
import pandas as pd


def parsefile(file, data):
    """Append every pingpong result found in a log file to ``data``.

    A result line is any line containing ``"RES:: [plain] "``. Its third
    whitespace-separated token is read as the message length and its last
    token as the elapsed time; all other lines are ignored.

    Args:
        file: Path of the log file to read. Also stored in the ``file``
            column so the plot can tell the runs apart.
        data: DataFrame holding the rows collected so far. It is not
            modified.

    Returns:
        A DataFrame with the rows of ``data`` followed by one row per result
        line, with columns ``file``, ``msg length`` and ``Elapsed Time``.
        ``data`` itself is returned when the log has no result lines.

    Raises:
        OSError: If ``file`` cannot be opened.
        ValueError: If a result line's length or time token is not numeric.
    """
    rows = []
    # The context manager closes the handle even if a line fails to parse.
    with open(file, 'r') as log:
        for line in log:
            if "RES:: [plain] " not in line:
                continue
            fields = line.split()
            rows.append({
                'file': file,
                'msg length': int(fields[2]),
                'Elapsed Time': float(fields[-1]),
            })

    if not rows:
        return data

    # Build the new rows in one go: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    parsed = pd.DataFrame(rows)
    if data.empty:
        # Concatenating with an empty frame is deprecated in recent pandas;
        # keep the caller's column order and skip the concat instead.
        ordered = list(dict.fromkeys([*data.columns, *parsed.columns]))
        return parsed.reindex(columns=ordered)
    return pd.concat([data, parsed], ignore_index=True)


def main():
    """Parse both pingpong logs and save the comparison plot as a PNG."""
    # Imported here so parsefile() can be imported without the plotting stack.
    import matplotlib.pyplot as plt
    import seaborn as sns

    data = pd.DataFrame(columns=["file", "msg length", "Elapsed Time"])
    data = parsefile("udp-cffi-pingpong.log", data)
    data = parsefile("udp-cython-pingpong.log", data)

    sns.set_theme(style="whitegrid")

    # One line per log file: mean elapsed time per message length, with a
    # 99% confidence band over the repeated runs in each log.
    # NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of
    # `errorbar=("ci", 99)`; kept as-is because the seaborn version used by
    # the nightly job is not visible here.
    g = sns.lineplot(x="msg length", y="Elapsed Time", hue="file", data=data,
                     ci=99, linewidth=3, markersize=10, alpha=0.5)
    g.set(ylabel='Elapsed Time')
    # The logs print this value as "Array length"; it is not a time in ms.
    g.set(xlabel='msg length (array elements)')
    g.set_xscale('log', base=2)
    g.set(title="cffi vs cython with the pingpong benchmark")
    plt.subplots_adjust(bottom=0.15)
    plt.savefig('cffi_vs_cython_pingpong.png')
    print("Created cffi_vs_cython_pingpong.png")


if __name__ == "__main__":
    main()
"""Plot task-bench timings from the nightly udp-cffi / udp-cython benchmark logs.

Reconstructed from the patch hunk for ``.tmp/compare_task_bench.py``.
"""
from os import listdir

import numpy as np
import pandas as pd


def parsefile(file, data):
    """Append every task-bench result found in a log file to ``data``.

    The log is scanned line by line. A line containing ``"Iterations: "``
    updates the current iteration count (its last token). A line containing
    ``"Elapsed Time "`` supplies the elapsed time (its second-to-last token,
    the last one being the unit) and emits one row pairing that time with the
    most recently seen iteration count (0 if none was seen yet).

    Args:
        file: Path of the log file to read. Also stored in the ``file``
            column so the plot can tell the runs apart.
        data: DataFrame holding the rows collected so far. It is not
            modified.

    Returns:
        A DataFrame with the rows of ``data`` followed by one row per
        elapsed-time line, with columns ``file``, ``Iterations`` and
        ``Elapsed Time``. ``data`` itself is returned when the log has no
        elapsed-time lines.

    Raises:
        OSError: If ``file`` cannot be opened.
        ValueError: If an iteration or elapsed-time token is not numeric.
    """
    rows = []
    iterations = 0
    # The context manager closes the handle even if a line fails to parse.
    with open(file, 'r') as log:
        for line in log:
            if "Iterations: " in line:
                iterations = int(line.split()[-1])
            if "Elapsed Time " in line:
                rows.append({
                    'file': file,
                    'Iterations': iterations,
                    'Elapsed Time': float(line.split()[-2]),
                })

    if not rows:
        return data

    # Build the new rows in one go: DataFrame.append was removed in pandas 2.0
    # and copied the whole frame on every call.
    parsed = pd.DataFrame(rows)
    if data.empty:
        # Concatenating with an empty frame is deprecated in recent pandas;
        # keep the caller's column order and skip the concat instead.
        ordered = list(dict.fromkeys([*data.columns, *parsed.columns]))
        return parsed.reindex(columns=ordered)
    return pd.concat([data, parsed], ignore_index=True)


def main():
    """Parse both task-bench logs and save the comparison plot as a PNG."""
    # Imported here so parsefile() can be imported without the plotting stack.
    import matplotlib.pyplot as plt
    import seaborn as sns

    data = pd.DataFrame(columns=["file", "Iterations", "Elapsed Time"])
    data = parsefile("udp-cffi-task-bench.log", data)
    data = parsefile("udp-cython-task-bench.log", data)

    sns.set_theme(style="whitegrid")

    # One line per log file: mean elapsed time per kernel iteration count,
    # with a 99% confidence band over the repeated runs in each log.
    # NOTE(review): seaborn >= 0.12 deprecates `ci=` in favour of
    # `errorbar=("ci", 99)`; kept as-is because the seaborn version used by
    # the nightly job is not visible here.
    g = sns.lineplot(x="Iterations", y="Elapsed Time", hue="file", data=data,
                     ci=99, linewidth=3, markersize=10, alpha=0.5)
    g.set(ylabel='Elapsed Time')
    g.set(xlabel='Iterations')
    g.set_xscale('log', base=2)
    g.set(title="cffi vs cython with the task-bench benchmark")
    plt.subplots_adjust(bottom=0.15)
    plt.savefig('cffi_vs_cython_task-bench.png')
    print("Created cffi_vs_cython_task-bench.png")


if __name__ == "__main__":
    main()
+Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. +My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7517490> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7517d30> +Array length = 1 +RES:: [plain] 1 40.15648365020752 +Array length = 2 +RES:: [plain] 2 39.112091064453125 +Array length = 4 +RES:: [plain] 4 38.93387317657471 +Array length = 8 +RES:: [plain] 8 41.063129901885986 +Array length = 16 +RES:: [plain] 16 41.2411093711853 +Array length = 32 +RES:: [plain] 32 41.03374481201172 +Array length = 64 +RES:: [plain] 64 37.12218999862671 +Array length = 128 +RES:: [plain] 128 39.837539196014404 +Array length = 256 +RES:: [plain] 256 40.46440124511719 +Array length = 512 +RES:: [plain] 512 44.843971729278564 +Array length = 1024 +RES:: [plain] 1024 45.204877853393555 +Array length = 2048 +RES:: [plain] 2048 43.63143444061279 +Array length = 4096 +RES:: [plain] 4096 57.63280391693115 +Array length = 8192 +RES:: [plain] 8192 76.37739181518555 +Array length = 16384 +RES:: [plain] 16384 114.97676372528076 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.138 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). 
Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. +My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516490> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d30> +Array length = 1 +RES:: [plain] 1 39.1005277633667 +Array length = 2 +RES:: [plain] 2 38.743793964385986 +Array length = 4 +RES:: [plain] 4 43.201565742492676 +Array length = 8 +RES:: [plain] 8 38.9094352722168 +Array length = 16 +RES:: [plain] 16 41.18305444717407 +Array length = 32 +RES:: [plain] 32 40.7448410987854 +Array length = 64 +RES:: [plain] 64 42.01465845108032 +Array length = 128 +RES:: [plain] 128 45.90874910354614 +Array length = 256 +RES:: [plain] 256 45.57478427886963 +Array length = 512 +RES:: [plain] 512 50.64809322357178 +Array length = 1024 +RES:: [plain] 1024 55.482685565948486 +Array length = 2048 +RES:: [plain] 2048 49.16834831237793 +Array length = 4096 +RES:: [plain] 4096 54.6494722366333 +Array length = 8192 +RES:: [plain] 8192 82.3284387588501 +Array length = 16384 +RES:: [plain] 16384 112.06519603729248 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.152 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516490> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d30> +Array length = 1 +RES:: [plain] 1 45.69602012634277 +Array length = 2 +RES:: [plain] 2 43.62159967422485 +Array length = 4 +RES:: [plain] 4 45.28534412384033 +Array length = 8 +RES:: [plain] 8 42.55479574203491 +Array length = 16 +RES:: [plain] 16 41.91511869430542 +Array length = 32 +RES:: [plain] 32 41.01341962814331 +Array length = 64 +RES:: [plain] 64 40.20887613296509 +Array length = 128 +RES:: [plain] 128 40.787458419799805 +Array length = 256 +RES:: [plain] 256 43.34598779678345 +Array length = 512 +RES:: [plain] 512 41.78130626678467 +Array length = 1024 +RES:: [plain] 1024 41.22030735015869 +Array length = 2048 +RES:: [plain] 2048 46.240150928497314 +Array length = 4096 +RES:: [plain] 4096 46.45693302154541 +Array length = 8192 +RES:: [plain] 8192 73.711097240448 +Array length = 16384 +RES:: [plain] 16384 95.82799673080444 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.132 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516490> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d30> +Array length = 1 +RES:: [plain] 1 41.44507646560669 +Array length = 2 +RES:: [plain] 2 44.38084363937378 +Array length = 4 +RES:: [plain] 4 43.155670166015625 +Array length = 8 +RES:: [plain] 8 39.33978080749512 +Array length = 16 +RES:: [plain] 16 39.40623998641968 +Array length = 32 +RES:: [plain] 32 41.42165184020996 +Array length = 64 +RES:: [plain] 64 40.746212005615234 +Array length = 128 +RES:: [plain] 128 40.216803550720215 +Array length = 256 +RES:: [plain] 256 40.44085741043091 +Array length = 512 +RES:: [plain] 512 43.04009675979614 +Array length = 1024 +RES:: [plain] 1024 42.25742816925049 +Array length = 2048 +RES:: [plain] 2048 43.71988773345947 +Array length = 4096 +RES:: [plain] 4096 50.69148540496826 +Array length = 8192 +RES:: [plain] 8192 74.50670003890991 +Array length = 16384 +RES:: [plain] 16384 108.46364498138428 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.149 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516490> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d30> +Array length = 1 +RES:: [plain] 1 63.12370300292969 +Array length = 2 +RES:: [plain] 2 44.25305128097534 +Array length = 4 +RES:: [plain] 4 47.86252975463867 +Array length = 8 +RES:: [plain] 8 49.58748817443848 +Array length = 16 +RES:: [plain] 16 45.912861824035645 +Array length = 32 +RES:: [plain] 32 52.34187841415405 +Array length = 64 +RES:: [plain] 64 45.03321647644043 +Array length = 128 +RES:: [plain] 128 45.86869478225708 +Array length = 256 +RES:: [plain] 256 47.442495822906494 +Array length = 512 +RES:: [plain] 512 46.772122383117676 +Array length = 1024 +RES:: [plain] 1024 48.66296052932739 +Array length = 2048 +RES:: [plain] 2048 49.2098331451416 +Array length = 4096 +RES:: [plain] 4096 52.45620012283325 +Array length = 8192 +RES:: [plain] 8192 78.18669080734253 +Array length = 16384 +RES:: [plain] 16384 103.56521606445312 +[Partition 0][Node 0] End of program diff --git a/.tmp/udp-cffi-task-bench.log b/.tmp/udp-cffi-task-bench.log new file mode 100644 index 00000000..d4758d91 --- /dev/null +++ b/.tmp/udp-cffi-task-bench.log @@ -0,0 +1,656 @@ +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.174 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 64 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.0097006599999645 +Time for last run: 0.0005995039999788787 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 132096 +Total Bytes 0 +Elapsed Time 5.995040e-04 seconds +FLOP/s 2.203421e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.174 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 128 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008754253999995854 +Time for last run: 0.0005851040000379726 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 263168 +Total Bytes 0 +Elapsed Time 5.851040e-04 seconds +FLOP/s 4.497799e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.174 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 256 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.00867615399999977 +Time for last run: 0.0005850040000154877 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 525312 +Total Bytes 0 +Elapsed Time 5.850040e-04 seconds +FLOP/s 8.979631e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.173 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 512 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008673054000041702 +Time for last run: 0.0006215039999801775 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 1049600 +Total Bytes 0 +Elapsed Time 6.215040e-04 seconds +FLOP/s 1.688807e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.173 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 1024 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008711155000014514 +Time for last run: 0.000591804000009688 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 2098176 +Total Bytes 0 +Elapsed Time 5.918040e-04 seconds +FLOP/s 3.545390e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.172 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 2048 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008710554000003867 +Time for last run: 0.0006275039999650289 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 4195328 +Total Bytes 0 +Elapsed Time 6.275040e-04 seconds +FLOP/s 6.685739e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.173 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 4096 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008808055000031345 +Time for last run: 0.0006666049999921597 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 8389632 +Total Bytes 0 +Elapsed Time 6.666050e-04 seconds +FLOP/s 1.258561e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.174 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 8192 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008760154000015064 +Time for last run: 0.0007135039999752735 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 16778240 +Total Bytes 0 +Elapsed Time 7.135040e-04 seconds +FLOP/s 2.351527e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.188 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 64 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009781355000029635 +Time for last run: 0.0009580060000189405 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 132096 +Total Bytes 0 +Elapsed Time 9.580060e-04 seconds +FLOP/s 1.378864e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.185 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 128 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.010087557000019842 +Time for last run: 0.0009556059999908939 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 263168 +Total Bytes 0 +Elapsed Time 9.556060e-04 seconds +FLOP/s 2.753938e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.183 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 256 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.00945535400001063 +Time for last run: 0.0009060049999902731 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 525312 +Total Bytes 0 +Elapsed Time 9.060050e-04 seconds +FLOP/s 5.798114e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.182 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.003 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 512 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.0095319540000105 +Time for last run: 0.0008442040000318229 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 1049600 +Total Bytes 0 +Elapsed Time 8.442040e-04 seconds +FLOP/s 1.243301e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.176 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.008 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 1024 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009661155000003419 +Time for last run: 0.0009981059999972786 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 2098176 +Total Bytes 0 +Elapsed Time 9.981060e-04 seconds +FLOP/s 2.102157e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.188 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.004 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 2048 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.01043335899998965 +Time for last run: 0.0009620060000088415 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 4195328 +Total Bytes 0 +Elapsed Time 9.620060e-04 seconds +FLOP/s 4.361021e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.187 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 4096 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.010313959000029627 +Time for last run: 0.0012098069999524341 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 8389632 +Total Bytes 0 +Elapsed Time 1.209807e-03 seconds +FLOP/s 6.934686e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.194 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 8192 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.010317658000019492 +Time for last run: 0.001109207000013157 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 16778240 +Total Bytes 0 +Elapsed Time 1.109207e-03 seconds +FLOP/s 1.512634e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program diff --git a/.tmp/udp-cython-pingpong.log b/.tmp/udp-cython-pingpong.log new file mode 100644 index 00000000..e5155a40 --- /dev/null +++ b/.tmp/udp-cython-pingpong.log @@ -0,0 +1,235 @@ +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.160 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff75ba610> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff75e3970> +Array length = 1 +RES:: [plain] 1 43.888092041015625 +Array length = 2 +RES:: [plain] 2 43.76024007797241 +Array length = 4 +RES:: [plain] 4 43.49935054779053 +Array length = 8 +RES:: [plain] 8 44.67439651489258 +Array length = 16 +RES:: [plain] 16 47.05125093460083 +Array length = 32 +RES:: [plain] 32 47.47962951660156 +Array length = 64 +RES:: [plain] 64 46.534717082977295 +Array length = 128 +RES:: [plain] 128 47.661781311035156 +Array length = 256 +RES:: [plain] 256 47.47897386550903 +Array length = 512 +RES:: [plain] 512 48.288166522979736 +Array length = 1024 +RES:: [plain] 1024 50.57644844055176 +Array length = 2048 +RES:: [plain] 2048 54.69226837158203 +Array length = 4096 +RES:: [plain] 4096 57.62690305709839 +Array length = 8192 +RES:: [plain] 8192 84.42598581314087 +Array length = 16384 +RES:: [plain] 16384 106.35274648666382 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.131 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516460> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d00> +Array length = 1 +RES:: [plain] 1 38.94263505935669 +Array length = 2 +RES:: [plain] 2 43.44892501831055 +Array length = 4 +RES:: [plain] 4 39.50059413909912 +Array length = 8 +RES:: [plain] 8 52.47640609741211 +Array length = 16 +RES:: [plain] 16 38.996338844299316 +Array length = 32 +RES:: [plain] 32 45.6354022026062 +Array length = 64 +RES:: [plain] 64 39.72911834716797 +Array length = 128 +RES:: [plain] 128 40.835022926330566 +Array length = 256 +RES:: [plain] 256 38.59835863113403 +Array length = 512 +RES:: [plain] 512 40.82232713699341 +Array length = 1024 +RES:: [plain] 1024 43.23399066925049 +Array length = 2048 +RES:: [plain] 2048 41.06622934341431 +Array length = 4096 +RES:: [plain] 4096 49.25590753555298 +Array length = 8192 +RES:: [plain] 8192 74.65660572052002 +Array length = 16384 +RES:: [plain] 16384 100.98874568939209 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.134 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.003 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516460> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d00> +Array length = 1 +RES:: [plain] 1 39.76237773895264 +Array length = 2 +RES:: [plain] 2 40.10319709777832 +Array length = 4 +RES:: [plain] 4 40.839552879333496 +Array length = 8 +RES:: [plain] 8 37.33038902282715 +Array length = 16 +RES:: [plain] 16 40.25566577911377 +Array length = 32 +RES:: [plain] 32 41.12619161605835 +Array length = 64 +RES:: [plain] 64 38.93637657165527 +Array length = 128 +RES:: [plain] 128 41.21506214141846 +Array length = 256 +RES:: [plain] 256 38.67143392562866 +Array length = 512 +RES:: [plain] 512 42.545437812805176 +Array length = 1024 +RES:: [plain] 1024 49.97760057449341 +Array length = 2048 +RES:: [plain] 2048 48.38055372238159 +Array length = 4096 +RES:: [plain] 4096 53.62778902053833 +Array length = 8192 +RES:: [plain] 8192 84.78057384490967 +Array length = 16384 +RES:: [plain] 16384 116.44357442855835 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.155 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516460> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d00> +Array length = 1 +RES:: [plain] 1 48.63160848617554 +Array length = 2 +RES:: [plain] 2 46.269893646240234 +Array length = 4 +RES:: [plain] 4 47.40065336227417 +Array length = 8 +RES:: [plain] 8 45.58128118515015 +Array length = 16 +RES:: [plain] 16 42.2205924987793 +Array length = 32 +RES:: [plain] 32 43.89834403991699 +Array length = 64 +RES:: [plain] 64 40.28666019439697 +Array length = 128 +RES:: [plain] 128 40.53777456283569 +Array length = 256 +RES:: [plain] 256 45.09943723678589 +Array length = 512 +RES:: [plain] 512 40.409207344055176 +Array length = 1024 +RES:: [plain] 1024 42.40846633911133 +Array length = 2048 +RES:: [plain] 2048 45.08596658706665 +Array length = 4096 +RES:: [plain] 4096 50.440847873687744 +Array length = 8192 +RES:: [plain] 8192 76.10297203063965 +Array length = 16384 +RES:: [plain] 16384 97.93758392333984 +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.137 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-114-ge1a85fbae +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.001 seconds. 
+My PE: 0 +[0] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516460> +My PE: 1 +[1] neighbor = <__main__.PingArrayProxy object at 0x7ffff7516d00> +Array length = 1 +RES:: [plain] 1 37.310123443603516 +Array length = 2 +RES:: [plain] 2 38.25861215591431 +Array length = 4 +RES:: [plain] 4 40.13127088546753 +Array length = 8 +RES:: [plain] 8 40.22252559661865 +Array length = 16 +RES:: [plain] 16 39.460837841033936 +Array length = 32 +RES:: [plain] 32 38.31470012664795 +Array length = 64 +RES:: [plain] 64 40.29017686843872 +Array length = 128 +RES:: [plain] 128 38.038671016693115 +Array length = 256 +RES:: [plain] 256 37.92917728424072 +Array length = 512 +RES:: [plain] 512 40.43787717819214 +Array length = 1024 +RES:: [plain] 1024 42.65230894088745 +Array length = 2048 +RES:: [plain] 2048 43.689846992492676 +Array length = 4096 +RES:: [plain] 4096 48.7285852432251 +Array length = 8192 +RES:: [plain] 8192 73.81588220596313 +Array length = 16384 +RES:: [plain] 16384 101.6358733177185 +[Partition 0][Node 0] End of program diff --git a/.tmp/udp-cython-task-bench.log b/.tmp/udp-cython-task-bench.log new file mode 100644 index 00000000..c2b7a9dd --- /dev/null +++ b/.tmp/udp-cython-task-bench.log @@ -0,0 +1,656 @@ +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.201 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 64 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008624055000041153 +Time for last run: 0.0005591039999899294 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 132096 +Total Bytes 0 +Elapsed Time 5.591040e-04 seconds +FLOP/s 2.362637e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.175 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 128 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008715055999971355 +Time for last run: 0.0005877039999973022 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 263168 +Total Bytes 0 +Elapsed Time 5.877040e-04 seconds +FLOP/s 4.477900e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.175 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 256 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.00856105499997284 +Time for last run: 0.0006010039999750916 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 525312 +Total Bytes 0 +Elapsed Time 6.010040e-04 seconds +FLOP/s 8.740574e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.174 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 512 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008673455999996804 +Time for last run: 0.0006210040000382833 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 1049600 +Total Bytes 0 +Elapsed Time 6.210040e-04 seconds +FLOP/s 1.690166e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.175 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 1024 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008787357000016982 +Time for last run: 0.0006138039999541434 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 2098176 +Total Bytes 0 +Elapsed Time 6.138040e-04 seconds +FLOP/s 3.418316e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.173 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 2048 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008677954999996018 +Time for last run: 0.0005995040000357221 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 4195328 +Total Bytes 0 +Elapsed Time 5.995040e-04 seconds +FLOP/s 6.997998e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.172 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 4096 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008611556000005294 +Time for last run: 0.0006403040000009241 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 8389632 +Total Bytes 0 +Elapsed Time 6.403040e-04 seconds +FLOP/s 1.310258e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.173 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 8192 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.008716956000000664 +Time for last run: 0.0006808040000123583 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 16778240 +Total Bytes 0 +Elapsed Time 6.808040e-04 seconds +FLOP/s 2.464474e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.212 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 64 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.011686076999978923 +Time for last run: 0.001018707000014274 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 132096 +Total Bytes 0 +Elapsed Time 1.018707e-03 seconds +FLOP/s 1.296703e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.194 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 128 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009927465000032498 +Time for last run: 0.000985606000028838 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 263168 +Total Bytes 0 +Elapsed Time 9.856060e-04 seconds +FLOP/s 2.670114e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.192 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 256 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.010256968000021516 +Time for last run: 0.001045707000002949 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 525312 +Total Bytes 0 +Elapsed Time 1.045707e-03 seconds +FLOP/s 5.023510e+08 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.193 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 512 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.010232867000013357 +Time for last run: 0.0010051070000258733 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 1049600 +Total Bytes 0 +Elapsed Time 1.005107e-03 seconds +FLOP/s 1.044267e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.181 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 1024 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009402862000001733 +Time for last run: 0.0009570059999646219 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 2098176 +Total Bytes 0 +Elapsed Time 9.570060e-04 seconds +FLOP/s 2.192438e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.179 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 2048 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009789064000017333 +Time for last run: 0.00100620699998899 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 4195328 +Total Bytes 0 +Elapsed Time 1.006207e-03 seconds +FLOP/s 4.169448e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.190 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 4096 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.015119499999968866 +Time for last run: 0.0009688070000493099 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 8389632 +Total Bytes 0 +Elapsed Time 9.688070e-04 seconds +FLOP/s 8.659756e+09 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program +Charmrun> scalable start enabled. +Charmrun> started all node programs in 0.181 seconds. +Charm++> Running in non-SMP mode: 2 processes (PEs) +Converse/Charm++ Commit ID: v7.1.0-devel-115-ga4b8220a7 +Charm++ built with internal error checking enabled. +Do not use for performance benchmarking (build without --enable-error-checking to do so). +Isomalloc> Synchronized global address space. +Charm++> scheduler running in netpoll mode. +CharmLB> Load balancer assumes all CPUs are same. +Charm4py> Running Charm4py version 1.0 on Python 3.8.10 (CPython). Using 'cython' interface to access Charm++ +Charm++> Running on 1 hosts (1 sockets x 2 cores x 1 PUs = 2-way SMP) +Charm++> cpu topology info is gathered in 0.000 seconds. 
+Running Task Benchmark + Configuration: + Task Graph 1: + Time Steps: 4 + Max Width: 4 + Dependence Type: stencil_1d + Radix: 3 + Period: 0 + Fraction Connected: 0.250000 + Kernel: + Type: compute_bound + Iterations: 8192 + Samples: 16 + Imbalance: 0.000000 + Output Bytes: 16 + Scratch Bytes: 0 +Time for last run: 0.009021359000030316 +Time for last run: 0.0011290080000208036 +Total Tasks 16 +Total Dependencies 30 + Unable to estimate local/nonlocal dependencies +Total FLOPs 16778240 +Total Bytes 0 +Elapsed Time 1.129008e-03 seconds +FLOP/s 1.486105e+10 +B/s 0.000000e+00 +Transfer (estimated): + Unable to estimate local/nonlocal transfer +[Partition 0][Node 0] End of program