Skip to content

Commit c1d9719

Browse files
committed
phase scheduling
1 parent f649215 commit c1d9719

11 files changed

+277
-35
lines changed

CPPLINT.cfg

100644100755
File mode changed.

Makefile

100644100755
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ BFS ?= 0
1717
D0_BSP ?= 0
1818
D1_PRIO ?= 0
1919
D1_CHUNK ?= 0
20+
D1_PHASE ?= 0
21+
DISTANCE ?= 1
2022

2123
DIST_UNIFORM ?= 1
2224

scripts/benchmark_hilbert.sh

100644100755
Lines changed: 52 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,47 +23,70 @@ echo " $originalnodes" >>$output
2323
echo " $originaledges" >>$output
2424
echo "" >>$output
2525

26-
rounds=10
26+
rounds=3
2727

2828
# benchmark the unordered input, parallel and not
29-
parallel=0 ; while [[ $parallel -le 1 ]] ; do
30-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
31-
(make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
32-
echo ""
33-
echo "Running original data, baseline, parallel=$parallel"
34-
echo ""
35-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
36-
echo "" >>$output
37-
38-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
39-
(make TMP=$benchroot D0_BSP=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
40-
echo ""
41-
echo "Running original data, d0-bsp, parallel=$parallel"
42-
echo ""
43-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
44-
echo "" >>$output
45-
46-
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
47-
(make TMP=$benchroot D1_CHUNK=1 CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
48-
echo ""
49-
echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
50-
echo ""
51-
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
52-
echo "" >>$output
53-
54-
((parallel = $parallel + 1)) ;
55-
done ;
29+
# parallel=0 ; while [[ $parallel -le 1 ]] ; do
30+
# distance=0 ; while [[ $distance -le 2 ]] ; do
31+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
32+
# (make TMP=$benchroot D1_PHASE=1 DISTANCE=$distance CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
33+
# echo ""
34+
# echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
35+
# echo ""
36+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
37+
# echo "" >>$output
38+
# ((distance = $distance + 1)) ;
39+
# done ;
40+
41+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
42+
# (make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
43+
# echo ""
44+
# echo "Running original data, baseline, parallel=$parallel"
45+
# echo ""
46+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
47+
# echo "" >>$output
48+
49+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
50+
# (make TMP=$benchroot D0_BSP=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
51+
# echo ""
52+
# echo "Running original data, d0-bsp, parallel=$parallel"
53+
# echo ""
54+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
55+
# echo "" >>$output
56+
57+
# (make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
58+
# (make TMP=$benchroot D1_CHUNK=1 CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
59+
# echo ""
60+
# echo "Running original data, chunk ($chunkbits bits), parallel=$parallel"
61+
# echo ""
62+
# make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges run-original-concat ;
63+
# echo "" >>$output
64+
65+
# ((parallel = $parallel + 1)) ;
66+
# done ;
5667

5768
# for each Hilbert granularity,
5869
# benchmark the baseline code, the fake BSP, and the best optimized one,
5970
# both in parallel and in series
60-
hilbert=1 ; while [[ $hilbert -le 9 ]] ; do
71+
hilbert=7 ; while [[ $hilbert -le 8 ]] ; do
6172
echo "Reordering with $hilbert Hilbert bits per dimension"
6273

6374
(make TMP=$benchroot clean-hilbert-reorder) 2>&1 >/dev/null;
6475
make TMP=$benchroot PARALLEL=1 HILBERTBITS=$hilbert ORIGINAL_NODES_FILE=$originalnodes ORIGINAL_EDGES_FILE=$originaledges reorder-graph ;
6576

6677
parallel=0 ; while [[ $parallel -le 1 ]] ; do
78+
distance=0 ; while [[ $distance -le 2 ]] ; do
79+
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
80+
(make TMP=$benchroot D1_PHASE=1 DISTANCE=$distance CHUNK_BITS=$chunkbits PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
81+
echo ""
82+
echo "Running reordered (hilbert=$hilbert) data, chunk ($chunkbits bits), parallel=$parallel"
83+
echo ""
84+
make TMP=$benchroot ROUNDS=$rounds OUTPUT=$output run-reordered-concat ;
85+
echo "Hilbert bits: $hilbert" >>$output;
86+
echo "" >>$output
87+
((distance = $distance + 1)) ;
88+
done ;
89+
6790
(make TMP=$benchroot clean-graph-compute) 2>&1 >/dev/null ;
6891
(make TMP=$benchroot BASELINE=1 PARALLEL=$parallel build-graph-compute) #2>&1 >/dev/null
6992
echo ""

scripts/benchmark_scalability.sh

100644100755
File mode changed.

scripts/distance_distribution.py

100644100755
File mode changed.

src/graph_compute/Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ CC ?= gcc
22
CXX ?= g++
33
CFLAGS = -O3 -Wall
44
CXXFLAGS = -fcilkplus -std=c++11 -O3 -Wall -m64
5-
LDFLAGS = -lcilkrts -ldl -lrt
5+
LDFLAGS = -L/usr/lib64 -lcilkrts -ldl -lrt
66
ROOT = ../../
77

88
HEADERS = common.h io.h
@@ -32,10 +32,18 @@ ifneq ($(D1_CHUNK),)
3232
DEFS += -DD1_CHUNK=$(D1_CHUNK)
3333
endif
3434

35+
ifneq ($(D1_PHASE),)
36+
DEFS += -DD1_PHASE=$(D1_PHASE)
37+
endif
38+
3539
ifneq ($(CHUNK_BITS),)
3640
DEFS += -DCHUNK_BITS=$(CHUNK_BITS)
3741
endif
3842

43+
ifneq ($(DISTANCE),)
44+
DEFS += -DDISTANCE=$(DISTANCE)
45+
endif
46+
3947
ifneq ($(BASELINE),)
4048
DEFS += -DBASELINE=$(BASELINE)
4149
endif

src/graph_compute/chunk_scheduling.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,16 +44,20 @@ static inline bool chunkDependency(vid_t v, vid_t w) {
4444

4545
// Counts, for every node, how many of its neighbors must execute before it
// (inter-chunk dependencies), initializing the dependencies/satisfied
// counters used by the chunk scheduler.  Runs serially so the global
// cntDependencies tally can be accumulated without atomics.
static void calculateNodeDependenciesChunk(vertex_t * const nodes,
                                           const vid_t cntNodes) {
  vid_t cntDependencies = 0;
  for (vid_t i = 0; i < cntNodes; i++) {
    vertex_t * node = &nodes[i];
    node->dependencies = 0;
    for (vid_t j = 0; j < node->cntEdges; j++) {
      if (interChunkDependency(node->edges[j], i)) {
        ++node->dependencies;
        cntDependencies++;
      }
    }
    node->satisfied = node->dependencies;
  }
  // %llu with a matching cast: pairing %lu with uint64_t is undefined
  // behavior on targets where uint64_t is unsigned long long (ILP32, LLP64)
  printf("InterChunkDependencies: %llu\n",
         static_cast<unsigned long long>(cntDependencies));
}
5862

5963
// for each node, move inter-chunk successors to the front of the edges list

src/graph_compute/common.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,18 @@
5454
#define D1_CHUNK 0
5555
#endif
5656

57+
#ifndef D1_PHASE
58+
#define D1_PHASE 0
59+
#endif
60+
5761
#ifndef PARALLEL
5862
#define PARALLEL 0
5963
#endif
6064

65+
#ifndef DISTANCE
66+
#define DISTANCE 1
67+
#endif
68+
6169
#if PARALLEL
6270
#include <cilk/cilk.h>
6371
#include <cilk/cilk_api.h>

src/graph_compute/compute.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ using namespace std;
1919
#include "./priority_scheduling.h"
2020
#elif D1_CHUNK
2121
#include "./chunk_scheduling.h"
22+
#elif D1_PHASE
23+
#include "./phase_scheduling.h"
2224
#elif D0_BSP
2325
#include "./bsp_scheduling.h"
2426
#else
@@ -126,7 +128,9 @@ int main(int argc, char *argv[]) {
126128
cout << "D0_BSP: " << D0_BSP << '\n';
127129
cout << "D1_PRIO: " << D1_PRIO << '\n';
128130
cout << "D1_CHUNK: " << D1_CHUNK << '\n';
131+
cout << "D1_PHASE: " << D1_PHASE << '\n';
129132
cout << "Parallel: " << PARALLEL << '\n';
133+
cout << "Distance: " << DISTANCE << '\n';
130134

131135
print_execution_data();
132136

src/graph_compute/phase_scheduling.h

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
#ifndef PHASE_SCHEDULING_H_
2+
#define PHASE_SCHEDULING_H_
3+
4+
#if D1_PHASE
5+
6+
#include <algorithm>
7+
#include <unordered_set>
8+
#include "./common.h"
9+
#include "./update_function.h"
10+
11+
#ifndef CHUNK_BITS
12+
#define CHUNK_BITS 16
13+
#endif
14+
15+
// Per-chunk scheduling state.  Vertices are grouped into chunks of
// (1 << CHUNK_BITS) consecutive ids; each chunk is walked sequentially
// by one worker, in two phases per round.
struct chunkdata_t {
  vid_t nextIndex;  // the next vertex in this chunk to be processed
  // phaseEndIndex[0] is the first vertex id beyond phase 0 of this chunk
  // (currently the chunk midpoint); phaseEndIndex[1] is the first vertex
  // id beyond the chunk itself.
  vid_t phaseEndIndex[2];  // the index of the first vertex beyond this chunk
};
typedef struct chunkdata_t chunkdata_t;
20+
21+
// Scheduler-wide state for phase scheduling.
struct scheddata_t {
  // Flattened inter-chunk successor lists: vertex i's dependent successors
  // occupy dependentEdges[dependentEdgeIndex[i]] ..
  // dependentEdges[dependentEdgeIndex[i] + cntDependentEdges[i] - 1].
  vid_t * dependentEdges;
  vid_t * dependentEdgeIndex;
  vid_t * cntDependentEdges;
  chunkdata_t * chunkdata;  // one entry per chunk
  vid_t cntChunks;          // number of chunks covering all cntNodes vertices
};
typedef struct scheddata_t scheddata_t;
29+
30+
// Returns true when v and w fall on the same side of their chunks' phase
// boundary.  This assumes the boundary between phases is the midpoint of
// the chunk (phaseEndIndex[0]); a custom per-chunk boundary chosen to
// minimize overall work may replace this later.
static inline bool samePhase(vid_t v, vid_t w, chunkdata_t * const chunkdata) {
  const bool vInFirstPhase = v < chunkdata[v >> CHUNK_BITS].phaseEndIndex[0];
  const bool wInFirstPhase = w < chunkdata[w >> CHUNK_BITS].phaseEndIndex[0];
  return vInFirstPhase == wInFirstPhase;
}
38+
39+
// True when w depends on v across a chunk boundary: v and w live in
// different chunks and v's offset within its chunk precedes w's offset
// within its own chunk.
static inline bool interChunkDependency(vid_t v, vid_t w) {
  static const vid_t chunkMask = (1 << CHUNK_BITS) - 1;
  if ((v >> CHUNK_BITS) == (w >> CHUNK_BITS)) {
    // same chunk: ordering is enforced by sequential execution instead
    return false;
  }
  return (v & chunkMask) < (w & chunkMask);
}
47+
48+
static void calculateNeighborhood(std::unordered_set<vid_t> * neighbors,
49+
std::unordered_set<vid_t> * oldNeighbors,
50+
vid_t v,
51+
vertex_t * const nodes, vid_t distance) {
52+
neighbors->clear();
53+
neighbors->insert(v);
54+
oldNeighbors->clear();
55+
for (vid_t d = 0; d < distance; d++) {
56+
*oldNeighbors = *neighbors;
57+
for (const auto& v : *oldNeighbors) {
58+
for (vid_t j = 0; j < nodes[v].cntEdges; j++) {
59+
if (oldNeighbors->count(nodes[v].edges[j]) == 0) {
60+
neighbors->insert(nodes[v].edges[j]);
61+
}
62+
}
63+
}
64+
}
65+
}
66+
67+
// Computes, for every vertex i, how many same-phase vertices within DISTANCE
// hops must execute before it (node->dependencies) and builds the flattened
// list of vertices i must notify after executing (sched->dependentEdges).
// Two passes: pass 1 counts dependencies so the edge array can be sized
// exactly; pass 2 recomputes each neighborhood and fills in the targets.
static void calculateNodeDependenciesChunk(vertex_t * const nodes,
                                           const vid_t cntNodes,
                                           scheddata_t * const sched) {
  sched->dependentEdgeIndex = new (std::nothrow) vid_t[cntNodes];
  sched->cntDependentEdges = new (std::nothrow) vid_t[cntNodes];
  // nothrow new yields NULL on failure -- check it, as createChunkData does
  assert(sched->dependentEdgeIndex != NULL);
  assert(sched->cntDependentEdges != NULL);
  vid_t cntDependencies = 0;
  // scratch sets reused across all vertices to avoid per-vertex allocation
  std::unordered_set<vid_t> neighbors;
  neighbors.reserve(1024);
  std::unordered_set<vid_t> oldNeighbors;
  oldNeighbors.reserve(1024);
  for (vid_t i = 0; i < cntNodes; i++) {
    calculateNeighborhood(&neighbors, &oldNeighbors, i, nodes, DISTANCE);
    sched->dependentEdgeIndex[i] = cntDependencies;
    vertex_t * node = &nodes[i];
    node->dependencies = 0;
    vid_t outDep = 0;
    for (const auto& nbr : neighbors) {
      if (samePhase(nbr, i, sched->chunkdata)) {
        if (interChunkDependency(nbr, i)) {
          // nbr must run before i
          ++node->dependencies;
        } else if (interChunkDependency(i, nbr)) {
          // i must notify nbr after it runs
          cntDependencies++;
          outDep++;
        }
      }
    }
    node->satisfied = node->dependencies;
    sched->cntDependentEdges[i] = outDep;
  }
  // %llu with a matching cast: pairing %lu with uint64_t is undefined
  // behavior on targets where uint64_t is unsigned long long
  printf("InterChunkDependencies: %llu\n",
         static_cast<unsigned long long>(cntDependencies));
  sched->dependentEdges = new (std::nothrow) vid_t[cntDependencies + 1];
  assert(sched->dependentEdges != NULL);
  // pass 2: fill the slots reserved (via dependentEdgeIndex) in pass 1
  for (vid_t i = 0; i < cntNodes; i++) {
    calculateNeighborhood(&neighbors, &oldNeighbors, i, nodes, DISTANCE);
    vid_t curIndex = sched->dependentEdgeIndex[i];
    for (const auto& nbr : neighbors) {
      if ((samePhase(nbr, i, sched->chunkdata))
          && (interChunkDependency(i, nbr))) {
        sched->dependentEdges[curIndex++] = nbr;
      }
    }
  }
}
110+
111+
// Lays out the per-chunk metadata: one chunkdata_t per (1 << CHUNK_BITS)
// consecutive vertex ids, with the phase boundary fixed at the chunk
// midpoint for now.
static void createChunkData(vertex_t * const nodes, const vid_t cntNodes,
                            scheddata_t * const scheddata) {
  // ceiling division: enough chunks to cover every vertex
  scheddata->cntChunks = (cntNodes + (1 << CHUNK_BITS) - 1) >> CHUNK_BITS;
  scheddata->chunkdata = new (std::nothrow) chunkdata_t[scheddata->cntChunks];
  assert(scheddata->chunkdata != NULL);

  cilk_for (vid_t i = 0; i < scheddata->cntChunks; ++i) {
    chunkdata_t * chunk = &scheddata->chunkdata[i];
    chunk->nextIndex = i << CHUNK_BITS;
    // phase 0 ends at the chunk midpoint, phase 1 at the chunk end;
    // both are clamped to cntNodes for the final (partial) chunk
    chunk->phaseEndIndex[0] = std::min(chunk->nextIndex + (1 << (CHUNK_BITS - 1)),
                                       cntNodes);
    chunk->phaseEndIndex[1] = std::min((i + 1) << CHUNK_BITS, cntNodes);
    // put code to greedily move boundaryIndex to minimize cost of
    // interChunk dependencies here
  }
}
127+
128+
// Scheduler entry point, called once before any rounds execute: lays out
// the chunks, then derives cross-chunk dependency counts and edge lists.
static void init_scheduling(vertex_t * const nodes, const vid_t cntNodes,
                            scheddata_t * const scheddata) {
  // must run first: dependency calculation reads the phase boundaries
  createChunkData(nodes, cntNodes, scheddata);
  calculateNodeDependenciesChunk(nodes, cntNodes, scheddata);
}
133+
134+
// Runs one full round of vertex updates in two phases.  Within a phase,
// chunks run in parallel (one worker per chunk) while vertices inside a
// chunk run in id order.  A vertex executes only once every cross-chunk
// predecessor it waits on has run (satisfied == 0); a chunk that reaches an
// unsatisfied vertex parks its cursor there, and the enclosing while loop
// sweeps all chunks again until every chunk completes the phase.
// `cntNodes` is part of the scheduler interface but unused here.
static void execute_round(const int round, vertex_t * const nodes,
                          const vid_t cntNodes, scheddata_t * const scheddata) {
  WHEN_DEBUG({
    cout << "Running chunk round" << round << endl;
  })

  // rewind every chunk's cursor to the start of its chunk
  for (vid_t i = 0; i < scheddata->cntChunks; i++) {
    scheddata->chunkdata[i].nextIndex = i << CHUNK_BITS;
  }

  const int NUM_PHASES = 2;
  for (int phase = 0; phase < NUM_PHASES; phase++) {
    // volatile: written by parallel workers, re-read by this loop each pass
    // NOTE(review): this relies on volatile for cross-worker visibility
    // rather than an atomic flag -- confirm against the cilk memory model
    volatile bool doneFlag = false;
    while (!doneFlag) {
      doneFlag = true;
      cilk_for (vid_t i = 0; i < scheddata->cntChunks; i++) {
        chunkdata_t * chunk = &scheddata->chunkdata[i];
        vid_t j = chunk->nextIndex;
        bool localDoneFlag = false;
        while (!localDoneFlag && (j < chunk->phaseEndIndex[phase])) {
          if (nodes[j].satisfied == 0) {
            update(nodes, j);
            if (DISTANCE > 0) {
              // re-arm this vertex for the next round, then release the
              // cross-chunk successors that were waiting on it
              nodes[j].satisfied = nodes[j].dependencies;
              vid_t edgeIndex = scheddata->dependentEdgeIndex[j];
              vid_t * edges = &scheddata->dependentEdges[edgeIndex];
              for (vid_t k = 0; k < scheddata->cntDependentEdges[j]; k++) {
                // atomic decrement: workers in other chunks may be
                // decrementing the same counter concurrently
                __sync_sub_and_fetch(&nodes[edges[k]].satisfied, 1);
              }
            }
          } else {
            // park the cursor on the blocked vertex and retry next sweep
            scheddata->chunkdata[i].nextIndex = j;
            localDoneFlag = true;  // we couldn't process one of the nodes, so break
            doneFlag = false;  // we couldn't process one, so we need another round
          }
          j++;
        }
        if (!localDoneFlag) {
          // chunk ran to the end of the phase; record final cursor position
          scheddata->chunkdata[i].nextIndex = j;
        }
      }
    }
  }
}
178+
179+
// Releases every array allocated during init_scheduling.  The deletions
// are independent, so their order is arbitrary.  nodes/cntNodes are part
// of the shared scheduler interface but unused here.
static void cleanup_scheduling(vertex_t * const nodes, const vid_t cntNodes,
                               scheddata_t * const scheddata) {
  delete[] scheddata->dependentEdges;
  delete[] scheddata->dependentEdgeIndex;
  delete[] scheddata->cntDependentEdges;
  delete[] scheddata->chunkdata;
}
186+
187+
// Reports the compile-time scheduler configuration.
static void print_execution_data() {
  cout << "Chunk size bits: ";
  cout << CHUNK_BITS << '\n';
}
190+
191+
#endif // D1_PHASE
192+
193+
#endif // PHASE_SCHEDULING_H_

0 commit comments

Comments
 (0)