Skip to content

Commit bc47061

Browse files
authored
Added Gloo as a backend option (#203)
* Added Gloo as a backend option
* Ran prettier and changed the backend from the hard-coded nccl to the selected option
* Corrected the code for adding Gloo as a backend
1 parent 1e7abf8 commit bc47061

File tree

7 files changed

+67
-52
lines changed

7 files changed

+67
-52
lines changed

src/components/TabTraining.vue

+12-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,16 @@
66
:label="deterministic.description"
77
:saveKey="deterministic.name"
88
/>
9-
<h2 class="training">Distributed Training (NCCL backend)</h2>
9+
<h2 class="training">Distributed Training</h2>
1010
<FormCheckbox label="Use distributed training" saveKey="use_dist" />
1111
<div v-show="store.config.use_dist">
12+
<h2>Choose a Backend</h2>
13+
<FormSelect
14+
required
15+
:saveKey="backend.name"
16+
:label="backend.description"
17+
:options="backend.options"
18+
/>
1219
<FormRadio :options="[launch, spawn]" saveKey="dist" defaultV="launch" />
1320
<FormInput
1421
:label="nproc_per_node.description"
@@ -46,13 +53,15 @@ import { training } from '../metadata/metadata.json'
4653
import FormCheckbox from './FormCheckbox.vue'
4754
import FormInput from './FormInput.vue'
4855
import FormRadio from './FormRadio.vue'
56+
import FormSelect from './FormSelect.vue'
4957
import { store } from '../store.js'
5058
5159
export default {
52-
components: { FormCheckbox, FormInput, FormRadio },
60+
components: { FormCheckbox, FormInput, FormRadio, FormSelect },
5361
setup() {
5462
const {
5563
deterministic,
64+
backend,
5665
launch,
5766
spawn,
5867
nproc_per_node,
@@ -69,6 +78,7 @@ export default {
6978
return {
7079
store,
7180
deterministic,
81+
backend,
7282
launch,
7383
spawn,
7484
nproc_per_node,

src/metadata/metadata.json

+5
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@
4141
"description": "Master node port for torch native backends (mandatory if you have filled number of nodes)",
4242
"min": 0,
4343
"default": 8080
44+
},
45+
"backend": {
46+
"name": "backend",
47+
"description": "Choose a Backend",
48+
"options": ["NCCL", "Gloo"]
4449
}
4550
},
4651
"handlers": {

src/templates/template-common/README.md

+10-10
Original file line numberDiff line numberDiff line change
@@ -9,26 +9,26 @@
99

1010
```sh
1111
python -m torch.distributed.launch \
12-
--nproc_per_node #:::= nproc_per_node :::# \
12+
--nproc_per_node #:::= it.nproc_per_node :::# \
1313
--nnodes #:::= it.nnodes :::# \
1414
--node_rank 0 \
1515
--master_addr #:::= it.master_addr :::# \
1616
--master_port #:::= it.master_port :::# \
1717
--use_env main.py \
18-
--backend nccl
18+
--backend #:::= it.backend :::#
1919
```
2020

2121
- Execute on worker nodes
2222

2323
```sh
2424
python -m torch.distributed.launch \
25-
--nproc_per_node #:::= nproc_per_node :::# \
25+
--nproc_per_node #:::= it.nproc_per_node :::# \
2626
--nnodes #:::= it.nnodes :::# \
2727
--node_rank <node_rank> \
2828
--master_addr #:::= it.master_addr :::# \
2929
--master_port #:::= it.master_port :::# \
3030
--use_env main.py \
31-
--backend nccl
31+
--backend #:::= it.backend :::#
3232
```
3333

3434
#::: } else { :::#
@@ -39,7 +39,7 @@ python -m torch.distributed.launch \
3939
python -m torch.distributed.launch \
4040
--nproc_per_node #:::= it.nproc_per_node :::# \
4141
--use_env main.py \
42-
--backend nccl
42+
--backend #:::= it.backend :::#
4343
```
4444

4545
#::: } :::#
@@ -56,24 +56,24 @@ python -m torch.distributed.launch \
5656

5757
```sh
5858
python main.py \
59-
--nproc_per_node #:::= nproc_per_node :::# \
59+
--nproc_per_node #:::= it.nproc_per_node :::# \
6060
--nnodes #:::= it.nnodes :::# \
6161
--node_rank 0 \
6262
--master_addr #:::= it.master_addr :::# \
6363
--master_port #:::= it.master_port :::# \
64-
--backend nccl
64+
--backend #:::= it.backend :::#
6565
```
6666

6767
- Execute on worker nodes
6868

6969
```sh
7070
python main.py \
71-
--nproc_per_node #:::= nproc_per_node :::# \
71+
--nproc_per_node #:::= it.nproc_per_node :::# \
7272
--nnodes #:::= it.nnodes :::# \
7373
--node_rank <node_rank> \
7474
--master_addr #:::= it.master_addr :::# \
7575
--master_port #:::= it.master_port :::# \
76-
--backend nccl
76+
--backend #:::= it.backend :::#
7777
```
7878

7979
#::: } else { :::#
@@ -83,7 +83,7 @@ python main.py \
8383
```sh
8484
python main.py \
8585
--nproc_per_node #:::= it.nproc_per_node :::# \
86-
--backend nccl
86+
--backend #:::= it.backend :::#
8787
```
8888

8989
#::: } :::#

src/templates/template-text-classification/README.md

+10-10
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,26 @@ pip install -r requirements.txt --progress-bar off -U
2525

2626
```sh
2727
python -m torch.distributed.launch \
28-
--nproc_per_node #:::= nproc_per_node :::# \
28+
--nproc_per_node #:::= it.nproc_per_node :::# \
2929
--nnodes #:::= it.nnodes :::# \
3030
--node_rank 0 \
3131
--master_addr #:::= it.master_addr :::# \
3232
--master_port #:::= it.master_port :::# \
3333
--use_env main.py \
34-
--backend nccl
34+
--backend #:::= it.backend :::#
3535
```
3636

3737
- Execute on worker nodes
3838

3939
```sh
4040
python -m torch.distributed.launch \
41-
--nproc_per_node #:::= nproc_per_node :::# \
41+
--nproc_per_node #:::= it.nproc_per_node :::# \
4242
--nnodes #:::= it.nnodes :::# \
4343
--node_rank <node_rank> \
4444
--master_addr #:::= it.master_addr :::# \
4545
--master_port #:::= it.master_port :::# \
4646
--use_env main.py \
47-
--backend nccl
47+
--backend #:::= it.backend :::#
4848
```
4949

5050
#::: } else { :::#
@@ -55,7 +55,7 @@ python -m torch.distributed.launch \
5555
python -m torch.distributed.launch \
5656
--nproc_per_node #:::= it.nproc_per_node :::# \
5757
--use_env main.py \
58-
--backend nccl
58+
--backend #:::= it.backend :::#
5959
```
6060

6161
#::: } :::#
@@ -72,24 +72,24 @@ python -m torch.distributed.launch \
7272

7373
```sh
7474
python main.py \
75-
--nproc_per_node #:::= nproc_per_node :::# \
75+
--nproc_per_node #:::= it.nproc_per_node :::# \
7676
--nnodes #:::= it.nnodes :::# \
7777
--node_rank 0 \
7878
--master_addr #:::= it.master_addr :::# \
7979
--master_port #:::= it.master_port :::# \
80-
--backend nccl
80+
--backend #:::= it.backend :::#
8181
```
8282

8383
- Execute on worker nodes
8484

8585
```sh
8686
python main.py \
87-
--nproc_per_node #:::= nproc_per_node :::# \
87+
--nproc_per_node #:::= it.nproc_per_node :::# \
8888
--nnodes #:::= it.nnodes :::# \
8989
--node_rank <node_rank> \
9090
--master_addr #:::= it.master_addr :::# \
9191
--master_port #:::= it.master_port :::# \
92-
--backend nccl
92+
--backend #:::= it.backend :::#
9393
```
9494

9595
#::: } else { :::#
@@ -99,7 +99,7 @@ python main.py \
9999
```sh
100100
python main.py \
101101
--nproc_per_node #:::= it.nproc_per_node :::# \
102-
--backend nccl
102+
--backend #:::= it.backend :::#
103103
```
104104

105105
#::: } :::#

src/templates/template-vision-classification/README.md

+10-10
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,26 @@ pip install -r requirements.txt --progress-bar off -U
2525

2626
```sh
2727
python -m torch.distributed.launch \
28-
--nproc_per_node #:::= nproc_per_node :::# \
28+
--nproc_per_node #:::= it.nproc_per_node :::# \
2929
--nnodes #:::= it.nnodes :::# \
3030
--node_rank 0 \
3131
--master_addr #:::= it.master_addr :::# \
3232
--master_port #:::= it.master_port :::# \
3333
--use_env main.py \
34-
--backend nccl
34+
--backend #:::= it.backend :::#
3535
```
3636

3737
- Execute on worker nodes
3838

3939
```sh
4040
python -m torch.distributed.launch \
41-
--nproc_per_node #:::= nproc_per_node :::# \
41+
--nproc_per_node #:::= it.nproc_per_node :::# \
4242
--nnodes #:::= it.nnodes :::# \
4343
--node_rank <node_rank> \
4444
--master_addr #:::= it.master_addr :::# \
4545
--master_port #:::= it.master_port :::# \
4646
--use_env main.py \
47-
--backend nccl
47+
--backend #:::= it.backend :::#
4848
```
4949

5050
#::: } else { :::#
@@ -55,7 +55,7 @@ python -m torch.distributed.launch \
5555
python -m torch.distributed.launch \
5656
--nproc_per_node #:::= it.nproc_per_node :::# \
5757
--use_env main.py \
58-
--backend nccl
58+
--backend #:::= it.backend :::#
5959
```
6060

6161
#::: } :::#
@@ -72,24 +72,24 @@ python -m torch.distributed.launch \
7272

7373
```sh
7474
python main.py \
75-
--nproc_per_node #:::= nproc_per_node :::# \
75+
--nproc_per_node #:::= it.nproc_per_node :::# \
7676
--nnodes #:::= it.nnodes :::# \
7777
--node_rank 0 \
7878
--master_addr #:::= it.master_addr :::# \
7979
--master_port #:::= it.master_port :::# \
80-
--backend nccl
80+
--backend #:::= it.backend :::#
8181
```
8282

8383
- Execute on worker nodes
8484

8585
```sh
8686
python main.py \
87-
--nproc_per_node #:::= nproc_per_node :::# \
87+
--nproc_per_node #:::= it.nproc_per_node :::# \
8888
--nnodes #:::= it.nnodes :::# \
8989
--node_rank <node_rank> \
9090
--master_addr #:::= it.master_addr :::# \
9191
--master_port #:::= it.master_port :::# \
92-
--backend nccl
92+
--backend #:::= it.backend :::#
9393
```
9494

9595
#::: } else { :::#
@@ -99,7 +99,7 @@ python main.py \
9999
```sh
100100
python main.py \
101101
--nproc_per_node #:::= it.nproc_per_node :::# \
102-
--backend nccl
102+
--backend #:::= it.backend :::#
103103
```
104104

105105
#::: } :::#

src/templates/template-vision-dcgan/README.md

+10-10
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,26 @@ pip install -r requirements.txt --progress-bar off -U
2525

2626
```sh
2727
python -m torch.distributed.launch \
28-
--nproc_per_node #:::= nproc_per_node :::# \
28+
--nproc_per_node #:::= it.nproc_per_node :::# \
2929
--nnodes #:::= it.nnodes :::# \
3030
--node_rank 0 \
3131
--master_addr #:::= it.master_addr :::# \
3232
--master_port #:::= it.master_port :::# \
3333
--use_env main.py \
34-
--backend nccl
34+
--backend #:::= it.backend :::#
3535
```
3636

3737
- Execute on worker nodes
3838

3939
```sh
4040
python -m torch.distributed.launch \
41-
--nproc_per_node #:::= nproc_per_node :::# \
41+
--nproc_per_node #:::= it.nproc_per_node :::# \
4242
--nnodes #:::= it.nnodes :::# \
4343
--node_rank <node_rank> \
4444
--master_addr #:::= it.master_addr :::# \
4545
--master_port #:::= it.master_port :::# \
4646
--use_env main.py \
47-
--backend nccl
47+
--backend #:::= it.backend :::#
4848
```
4949

5050
#::: } else { :::#
@@ -55,7 +55,7 @@ python -m torch.distributed.launch \
5555
python -m torch.distributed.launch \
5656
--nproc_per_node #:::= it.nproc_per_node :::# \
5757
--use_env main.py \
58-
--backend nccl
58+
--backend #:::= it.backend :::#
5959
```
6060

6161
#::: } :::#
@@ -72,24 +72,24 @@ python -m torch.distributed.launch \
7272

7373
```sh
7474
python main.py \
75-
--nproc_per_node #:::= nproc_per_node :::# \
75+
--nproc_per_node #:::= it.nproc_per_node :::# \
7676
--nnodes #:::= it.nnodes :::# \
7777
--node_rank 0 \
7878
--master_addr #:::= it.master_addr :::# \
7979
--master_port #:::= it.master_port :::# \
80-
--backend nccl
80+
--backend #:::= it.backend :::#
8181
```
8282

8383
- Execute on worker nodes
8484

8585
```sh
8686
python main.py \
87-
--nproc_per_node #:::= nproc_per_node :::# \
87+
--nproc_per_node #:::= it.nproc_per_node :::# \
8888
--nnodes #:::= it.nnodes :::# \
8989
--node_rank <node_rank> \
9090
--master_addr #:::= it.master_addr :::# \
9191
--master_port #:::= it.master_port :::# \
92-
--backend nccl
92+
--backend #:::= it.backend :::#
9393
```
9494

9595
#::: } else { :::#
@@ -99,7 +99,7 @@ python main.py \
9999
```sh
100100
python main.py \
101101
--nproc_per_node #:::= it.nproc_per_node :::# \
102-
--backend nccl
102+
--backend #:::= it.backend :::#
103103
```
104104

105105
#::: } :::#

0 commit comments

Comments
 (0)