Skip to content

Commit c1d9719

Browse files
committed
phase scheduling
1 parent f649215 commit c1d9719

11 files changed

+277
-35
lines changed

CPPLINT.cfg

100644100755
File mode changed.

Makefile

100644100755
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ BFS ?= 0
1717
D0_BSP ?= 0
1818
D1_PRIO ?= 0
1919
D1_CHUNK ?= 0
20+
D1_PHASE ?= 0
21+
DISTANCE ?= 1
2022

2123
DIST_UNIFORM ?= 1
2224

scripts/benchmark_hilbert.sh

100644100755
Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,47 +23,70 @@ echo " $originalnodes" >>$output
2323
echo " $originaledges" >>$output
2424
echo "" >>$output
2525

26-
rounds=10
26+
rounds=3
2727

2828
# benchmark the unordered input, parallel and not
29-
parallel=0 ; while [[ $parallel -le 1 ]] ; do
30-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
31-
(make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
32-
echo ""
33-
echo "Running original data, baseline, parallel=$parallel"
34-
echo ""
35-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
36-
echo "" >>$output
37-
38-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
39-
(make TMP=$benchroot D0_BSP=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
40-
echo ""
41-
echo "Running original data, d0-bsp, parallel=$parallel"
42-
echo ""
43-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
44-
echo "" >>$output
45-
46-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
47-
(make TMP=$benchroot D1_CHUNK=1 CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
48-
echo ""
49-
echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
50-
echo ""
51-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
52-
echo "" >>$output
53-
54-
((parallel = $parallel + 1)) ;
55-
done ;
29+
# parallel=0 ; while [[ $parallel -le 1 ]] ; do
30+
# distance=0 ; while [[ $distance -le 2 ]] ; do
31+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
32+
# (make TMP=$benchroot D1_PHASE=1 DISTANCE=$distance CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
33+
# echo ""
34+
# echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
35+
# echo ""
36+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
37+
# echo "" >>$output
38+
# ((distance = $distance + 1)) ;
39+
# done ;
40+
41+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
42+
# (make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
43+
# echo ""
44+
# echo "Running original data, baseline, parallel=$parallel"
45+
# echo ""
46+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
47+
# echo "" >>$output
48+
49+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
50+
# (make TMP=$benchroot D0_BSP=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
51+
# echo ""
52+
# echo "Running original data, d0-bsp, parallel=$parallel"
53+
# echo ""
54+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
55+
# echo "" >>$output
56+
57+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
58+
# (make TMP=$benchroot D1_CHUNK=1 CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
59+
# echo ""
60+
# echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
61+
# echo ""
62+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
63+
# echo "" >>$output
64+
65+
# ((parallel = $parallel + 1)) ;
66+
# done ;
5667

5768
# for each Hilbert granularity,
5869
# benchmark the baseline code, the fake BSP, and the best optimized one,
5970
# both in parallel and in series
60-
hilbert=1 ; while [[ $hilbert -le 9 ]] ; do
71+
hilbert=7 ; while [[ $hilbert -le 8 ]] ; do
6172
echo "Reordering with $hilbert Hilbert bits per dimension"
6273

6374
(make TMP=$benchroot clean-hilbert-reorder) 2>&1 >/dev/null;
6475
make TMP=$benchroot PARALLEL=1 HILBERTBITS=$hilbert ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges reorder-graph ;
6576

6677
parallel=0 ; while [[ $parallel -le 1 ]] ; do
78+
distance=0 ; while [[ $distance -le 2 ]] ; do
79+
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
80+
(make TMP=$benchroot D1_PHASE=1 DISTANCE=$distance CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
81+
echo ""
82+
echo "Running reordered (hilbert=$hilbert) data, chunk ($chunkbits bits), parallel=$parallel"
83+
echo ""
84+
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output run-reordered-concat ;
85+
echo "Hilbert bits: $hilbert" >>$output;
86+
echo "" >>$output
87+
((distance = $distance + 1)) ;
88+
done ;
89+
6790
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
6891
(make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
6992
echo ""

scripts/benchmark_scalability.sh

100644100755
File mode changed.

scripts/distance_distribution.py

100644100755
File mode changed.

src/graph_compute/Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ CC ?= gcc
22
CXX ?= g++
33
CFLAGS = -O3 -Wall
44
CXXFLAGS = -fcilkplus -std=c++11 -O3 -Wall -m64
5-
LDFLAGS = -lcilkrts -ldl -lrt
5+
LDFLAGS = -L/usr/lib64 -lcilkrts -ldl -lrt
66
ROOT = ../../
77

88
HEADERS = common.h io.h
@@ -32,10 +32,18 @@ ifneq ($(D1_CHUNK),)
3232
DEFS += -DD1_CHUNK=$(D1_CHUNK)
3333
endif
3434

35+
ifneq ($(D1_PHASE),)
36+
DEFS += -DD1_PHASE=$(D1_PHASE)
37+
endif
38+
3539
ifneq ($(CHUNK_BITS),)
3640
DEFS += -DCHUNK_BITS=$(CHUNK_BITS)
3741
endif
3842

43+
ifneq ($(DISTANCE),)
44+
DEFS += -DDISTANCE=$(DISTANCE)
45+
endif
46+
3947
ifneq ($(BASELINE),)
4048
DEFS += -DBASELINE=$(BASELINE)
4149
endif

src/graph_compute/chunk_scheduling.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,20 @@ static inline bool chunkDependency(vid_t v, vid_t w) {
4444

4545
// Counts, for every node, how many of its neighbors must execute before it
// (inter-chunk dependencies), initializing the dependencies/satisfied
// counters used by the chunk scheduler.  Runs serially so the global
// cntDependencies tally can be accumulated without atomics.
static void calculateNodeDependenciesChunk(vertex_t * const nodes,
                                           const vid_t cntNodes) {
  vid_t cntDependencies = 0;
  for (vid_t i = 0; i < cntNodes; i++) {
    vertex_t * node = &nodes[i];
    node->dependencies = 0;
    for (vid_t j = 0; j < node->cntEdges; j++) {
      if (interChunkDependency(node->edges[j], i)) {
        ++node->dependencies;
        cntDependencies++;
      }
    }
    node->satisfied = node->dependencies;
  }
  // %llu with a matching cast: pairing %lu with uint64_t is undefined
  // behavior on targets where uint64_t is unsigned long long (ILP32, LLP64)
  printf("InterChunkDependencies: %llu\n",
         static_cast<unsigned long long>(cntDependencies));
}
5862

5963
// for each node, move inter-chunk successors to the front of the edges list

src/graph_compute/common.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,18 @@
5454
#define D1_CHUNK 0
5555
#endif
5656

57+
#ifndef D1_PHASE
58+
#define D1_PHASE 0
59+
#endif
60+
5761
#ifndef PARALLEL
5862
#define PARALLEL 0
5963
#endif
6064

65+
#ifndef DISTANCE
66+
#define DISTANCE 1
67+
#endif
68+
6169
#if PARALLEL
6270
#include <cilk/cilk.h>
6371
#include <cilk/cilk_api.h>

src/graph_compute/compute.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ using namespace std;
1919
#include "./priority_scheduling.h"
2020
#elif D1_CHUNK
2121
#include "./chunk_scheduling.h"
22+
#elif D1_PHASE
23+
#include "./phase_scheduling.h"
2224
#elif D0_BSP
2325
#include "./bsp_scheduling.h"
2426
#else
@@ -126,7 +128,9 @@ int main(int argc, char *argv[]) {
126128
cout << "D0_BSP: " << D0_BSP << '\n';
127129
cout << "D1_PRIO: " << D1_PRIO << '\n';
128130
cout << "D1_CHUNK: " << D1_CHUNK << '\n';
131+
cout << "D1_PHASE: " << D1_PHASE << '\n';
129132
cout << "Parallel: " << PARALLEL << '\n';
133+
cout << "Distance: " << DISTANCE << '\n';
130134

131135
print_execution_data();
132136

src/graph_compute/phase_scheduling.h

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
#ifndef PHASE_SCHEDULING_H_
2+
#define PHASE_SCHEDULING_H_
3+
4+
#if D1_PHASE
5+
6+
#include <algorithm>
7+
#include <unordered_set>
8+
#include "./common.h"
9+
#include "./update_function.h"
10+
11+
#ifndef CHUNK_BITS
12+
#define CHUNK_BITS 16
13+
#endif
14+
15+
// Per-chunk scheduling state.  Vertices are grouped into chunks of
// (1 << CHUNK_BITS) consecutive ids; each chunk is walked sequentially
// by one worker, in two phases per round.
struct chunkdata_t {
  vid_t nextIndex;  // the next vertex in this chunk to be processed
  // phaseEndIndex[0] is the first vertex id beyond phase 0 of this chunk
  // (currently the chunk midpoint); phaseEndIndex[1] is the first vertex
  // id beyond the chunk itself.
  vid_t phaseEndIndex[2];  // the index of the first vertex beyond this chunk
};
typedef struct chunkdata_t chunkdata_t;
20+
21+
// Scheduler-wide state for phase scheduling.
struct scheddata_t {
  // Flattened inter-chunk successor lists: vertex i's dependent successors
  // occupy dependentEdges[dependentEdgeIndex[i]] ..
  // dependentEdges[dependentEdgeIndex[i] + cntDependentEdges[i] - 1].
  vid_t * dependentEdges;
  vid_t * dependentEdgeIndex;
  vid_t * cntDependentEdges;
  chunkdata_t * chunkdata;  // one entry per chunk
  vid_t cntChunks;          // number of chunks covering all cntNodes vertices
};
typedef struct scheddata_t scheddata_t;
29+
30+
// Returns true when v and w fall on the same side of their chunks' phase
// boundary.  This assumes the boundary between phases is the midpoint of
// the chunk (phaseEndIndex[0]); a custom per-chunk boundary chosen to
// minimize overall work may replace this later.
static inline bool samePhase(vid_t v, vid_t w, chunkdata_t * const chunkdata) {
  const bool vInFirstPhase = v < chunkdata[v >> CHUNK_BITS].phaseEndIndex[0];
  const bool wInFirstPhase = w < chunkdata[w >> CHUNK_BITS].phaseEndIndex[0];
  return vInFirstPhase == wInFirstPhase;
}
38+
39+
// True when w depends on v across a chunk boundary: v and w live in
// different chunks and v's offset within its chunk precedes w's offset
// within its own chunk.
static inline bool interChunkDependency(vid_t v, vid_t w) {
  static const vid_t chunkMask = (1 << CHUNK_BITS) - 1;
  if ((v >> CHUNK_BITS) == (w >> CHUNK_BITS)) {
    // same chunk: ordering is enforced by sequential execution instead
    return false;
  }
  return (v & chunkMask) < (w & chunkMask);
}
47+
48+
static void calculateNeighborhood(std::unordered_set<vid_t> * neighbors,
49+
std::unordered_set<vid_t> * oldNeighbors,
50+
vid_t v,
51+
vertex_t * const nodes, vid_t distance) {
52+
neighbors->clear();
53+
neighbors->insert(v);
54+
oldNeighbors->clear();
55+
for (vid_t d = 0; d < distance; d++) {
56+
*oldNeighbors = *neighbors;
57+
for (const auto& v : *oldNeighbors) {
58+
for (vid_t j = 0; j < nodes[v].cntEdges; j++) {
59+
if (oldNeighbors->count(nodes[v].edges[j]) == 0) {
60+
neighbors->insert(nodes[v].edges[j]);
61+
}
62+
}
63+
}
64+
}
65+
}
66+
67+
// Computes, for every vertex i, how many same-phase vertices within DISTANCE
// hops must execute before it (node->dependencies) and builds the flattened
// list of vertices i must notify after executing (sched->dependentEdges).
// Two passes: pass 1 counts dependencies so the edge array can be sized
// exactly; pass 2 recomputes each neighborhood and fills in the targets.
static void calculateNodeDependenciesChunk(vertex_t * const nodes,
                                           const vid_t cntNodes,
                                           scheddata_t * const sched) {
  sched->dependentEdgeIndex = new (std::nothrow) vid_t[cntNodes];
  sched->cntDependentEdges = new (std::nothrow) vid_t[cntNodes];
  // nothrow new yields NULL on failure -- check it, as createChunkData does
  assert(sched->dependentEdgeIndex != NULL);
  assert(sched->cntDependentEdges != NULL);
  vid_t cntDependencies = 0;
  // scratch sets reused across all vertices to avoid per-vertex allocation
  std::unordered_set<vid_t> neighbors;
  neighbors.reserve(1024);
  std::unordered_set<vid_t> oldNeighbors;
  oldNeighbors.reserve(1024);
  for (vid_t i = 0; i < cntNodes; i++) {
    calculateNeighborhood(&neighbors, &oldNeighbors, i, nodes, DISTANCE);
    sched->dependentEdgeIndex[i] = cntDependencies;
    vertex_t * node = &nodes[i];
    node->dependencies = 0;
    vid_t outDep = 0;
    for (const auto& nbr : neighbors) {
      if (samePhase(nbr, i, sched->chunkdata)) {
        if (interChunkDependency(nbr, i)) {
          // nbr must run before i
          ++node->dependencies;
        } else if (interChunkDependency(i, nbr)) {
          // i must notify nbr after it runs
          cntDependencies++;
          outDep++;
        }
      }
    }
    node->satisfied = node->dependencies;
    sched->cntDependentEdges[i] = outDep;
  }
  // %llu with a matching cast: pairing %lu with uint64_t is undefined
  // behavior on targets where uint64_t is unsigned long long
  printf("InterChunkDependencies: %llu\n",
         static_cast<unsigned long long>(cntDependencies));
  sched->dependentEdges = new (std::nothrow) vid_t[cntDependencies + 1];
  assert(sched->dependentEdges != NULL);
  // pass 2: fill the slots reserved (via dependentEdgeIndex) in pass 1
  for (vid_t i = 0; i < cntNodes; i++) {
    calculateNeighborhood(&neighbors, &oldNeighbors, i, nodes, DISTANCE);
    vid_t curIndex = sched->dependentEdgeIndex[i];
    for (const auto& nbr : neighbors) {
      if ((samePhase(nbr, i, sched->chunkdata))
          && (interChunkDependency(i, nbr))) {
        sched->dependentEdges[curIndex++] = nbr;
      }
    }
  }
}
110+
111+
// Lays out the per-chunk metadata: one chunkdata_t per (1 << CHUNK_BITS)
// consecutive vertex ids, with the phase boundary fixed at the chunk
// midpoint for now.
static void createChunkData(vertex_t * const nodes, const vid_t cntNodes,
                            scheddata_t * const scheddata) {
  // ceiling division: enough chunks to cover every vertex
  scheddata->cntChunks = (cntNodes + (1 << CHUNK_BITS) - 1) >> CHUNK_BITS;
  scheddata->chunkdata = new (std::nothrow) chunkdata_t[scheddata->cntChunks];
  assert(scheddata->chunkdata != NULL);

  cilk_for (vid_t i = 0; i < scheddata->cntChunks; ++i) {
    chunkdata_t * chunk = &scheddata->chunkdata[i];
    chunk->nextIndex = i << CHUNK_BITS;
    // phase 0 ends at the chunk midpoint, phase 1 at the chunk end;
    // both are clamped to cntNodes for the final (partial) chunk
    chunk->phaseEndIndex[0] = std::min(chunk->nextIndex + (1 << (CHUNK_BITS - 1)),
                                       cntNodes);
    chunk->phaseEndIndex[1] = std::min((i + 1) << CHUNK_BITS, cntNodes);
    // put code to greedily move boundaryIndex to minimize cost of
    // interChunk dependencies here
  }
}
127+
128+
// Scheduler entry point, called once before any rounds execute: lays out
// the chunks, then derives cross-chunk dependency counts and edge lists.
static void init_scheduling(vertex_t * const nodes, const vid_t cntNodes,
                            scheddata_t * const scheddata) {
  // must run first: dependency calculation reads the phase boundaries
  createChunkData(nodes, cntNodes, scheddata);
  calculateNodeDependenciesChunk(nodes, cntNodes, scheddata);
}
133+
134+
// Runs one full round of vertex updates in two phases.  Within a phase,
// chunks run in parallel (one worker per chunk) while vertices inside a
// chunk run in id order.  A vertex executes only once every cross-chunk
// predecessor it waits on has run (satisfied == 0); a chunk that reaches an
// unsatisfied vertex parks its cursor there, and the enclosing while loop
// sweeps all chunks again until every chunk completes the phase.
// `cntNodes` is part of the scheduler interface but unused here.
static void execute_round(const int round, vertex_t * const nodes,
                          const vid_t cntNodes, scheddata_t * const scheddata) {
  WHEN_DEBUG({
    cout << "Running chunk round" << round << endl;
  })

  // rewind every chunk's cursor to the start of its chunk
  for (vid_t i = 0; i < scheddata->cntChunks; i++) {
    scheddata->chunkdata[i].nextIndex = i << CHUNK_BITS;
  }

  const int NUM_PHASES = 2;
  for (int phase = 0; phase < NUM_PHASES; phase++) {
    // volatile: written by parallel workers, re-read by this loop each pass
    // NOTE(review): this relies on volatile for cross-worker visibility
    // rather than an atomic flag -- confirm against the cilk memory model
    volatile bool doneFlag = false;
    while (!doneFlag) {
      doneFlag = true;
      cilk_for (vid_t i = 0; i < scheddata->cntChunks; i++) {
        chunkdata_t * chunk = &scheddata->chunkdata[i];
        vid_t j = chunk->nextIndex;
        bool localDoneFlag = false;
        while (!localDoneFlag && (j < chunk->phaseEndIndex[phase])) {
          if (nodes[j].satisfied == 0) {
            update(nodes, j);
            if (DISTANCE > 0) {
              // re-arm this vertex for the next round, then release the
              // cross-chunk successors that were waiting on it
              nodes[j].satisfied = nodes[j].dependencies;
              vid_t edgeIndex = scheddata->dependentEdgeIndex[j];
              vid_t * edges = &scheddata->dependentEdges[edgeIndex];
              for (vid_t k = 0; k < scheddata->cntDependentEdges[j]; k++) {
                // atomic decrement: workers in other chunks may be
                // decrementing the same counter concurrently
                __sync_sub_and_fetch(&nodes[edges[k]].satisfied, 1);
              }
            }
          } else {
            // park the cursor on the blocked vertex and retry next sweep
            scheddata->chunkdata[i].nextIndex = j;
            localDoneFlag = true;  // we couldn't process one of the nodes, so break
            doneFlag = false;  // we couldn't process one, so we need another round
          }
          j++;
        }
        if (!localDoneFlag) {
          // chunk ran to the end of the phase; record final cursor position
          scheddata->chunkdata[i].nextIndex = j;
        }
      }
    }
  }
}
178+
179+
// Releases every array allocated during init_scheduling.  The deletions
// are independent, so their order is arbitrary.  nodes/cntNodes are part
// of the shared scheduler interface but unused here.
static void cleanup_scheduling(vertex_t * const nodes, const vid_t cntNodes,
                               scheddata_t * const scheddata) {
  delete[] scheddata->dependentEdges;
  delete[] scheddata->dependentEdgeIndex;
  delete[] scheddata->cntDependentEdges;
  delete[] scheddata->chunkdata;
}
186+
187+
// Reports the compile-time scheduler configuration.
static void print_execution_data() {
  cout << "Chunk size bits: ";
  cout << CHUNK_BITS << '\n';
}
190+
191+
#endif // D1_PHASE
192+
193+
#endif // PHASE_SCHEDULING_H_

0 commit comments

Comments
 (0)