From 6f7e8a146594375ee022142fc52aa694e31e9398 Mon Sep 17 00:00:00 2001 From: Brian Wheatman Date: Fri, 12 Aug 2022 08:43:23 -0400 Subject: [PATCH] compiling with opencilk --- Makefile | 62 ++++++++++++++++++++++++++++++---------------- Semirings.h | 15 +++++------ aligned.h | 9 ++++--- bicsb.cpp | 13 +++++----- bmcsb.h | 5 ++-- both_test.cpp | 2 +- csb_spmv_test.cpp | 7 +++++- csb_spmvt_test.cpp | 2 +- csc.cpp | 4 +-- csc.h | 2 +- friends.h | 9 ++++--- spmm_test.cpp | 4 +-- spvec.cpp | 2 +- sym_spmv_test.cpp | 16 +++++++----- utility.h | 26 ++++++++++++++----- 15 files changed, 113 insertions(+), 65 deletions(-) diff --git a/Makefile b/Makefile index 48a22f3..c6c01eb 100644 --- a/Makefile +++ b/Makefile @@ -1,55 +1,75 @@ -CILK = /opt/intel/composer_xe_2013.5.198/compiler -INCADD = -I$(CILK)/include -I$(CILK)/examples/include -LIBADD = -L$(CILK)/lib/intel64 +CILK?=0 +NATIVE?=1 +OPT?=3 +SANITIZE?=0 + +CFLAGS := -Wall -Wextra -O$(OPT) -g -std=c++20 -gdwarf-4 -fno-exceptions -Wno-unknown-pragmas -Wno-comment + +ifeq ($(NATIVE),1) +CFLAGS += -march=native +endif + +ifeq ($(CILK),1) +CFLAGS += -fopencilk +endif + +ifeq ($(SANITIZE),1) +ifeq ($(CILK),1) +CFLAGS += -fsanitize=cilk,undefined,address -fno-omit-frame-pointer +else +CFLAGS += -fsanitize=undefined,address -fno-omit-frame-pointer +endif +endif + +DEFINES := -DCILK=$(CILK) + +all: parspmv both_d spmm_dall spmm_a spmm_sall -GCCOPT = -O2 -fno-rtti -fno-exceptions # -ftree-vectorize -INTELOPT = -O2 -no-ipo -fno-rtti -fno-exceptions -parallel -restrict -std=c++11 -xAVX -no-prec-div #-fno-inline-functions -DEB = -g -DNOBM -O0 -parallel -restrict -std=c++11 seqsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o - icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqsym sym_spmv_test.cpp SSEspmv.o + $(CXX) $(CFLAGS) $(DEFINES) -o seqsym sym_spmv_test.cpp SSEspmv.o parsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o - icpc $(INCADD) $(DEB) -o parsym sym_spmv_test.cpp SSEspmv.o + $(CXX) $(CFLAGS) $(DEFINES) -o parsym sym_spmv_test.cpp SSEspmv.o symanal: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o - icpc -DSTATS $(INCADD) $(INTELOPT) -o symanal sym_spmv_test.cpp SSEspmv.o -lcilkutil + $(CXX) $(CFLAGS) $(DEFINES) -o symanal sym_spmv_test.cpp SSEspmv.o seqspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o - icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqspmv csb_spmv_test.cpp SSEspmv.o + $(CXX) $(CFLAGS) $(DEFINES) -o seqspmv csb_spmv_test.cpp SSEspmv.o parspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o - icpc $(INCADD) $(INTELOPT) -o parspmv csb_spmv_test.cpp SSEspmv.o + $(CXX) $(CFLAGS) $(DEFINES) -o parspmv csb_spmv_test.cpp SSEspmv.o parspmv_nobm: csb_spmv_test.cpp bicsb.cpp bicsb.h friends.h utility.h - icpc $(INCADD) $(INTELOPT) -DNOBM -o parspmv_nobm csb_spmv_test.cpp + $(CXX) $(CFLAGS) $(DEFINES) -DNOBM -o parspmv_nobm csb_spmv_test.cpp parspmvt: csb_spmvt_test.cpp bicsb.cpp bicsb.h utility.h friends.h - icpc $(INCADD) $(INTELOPT) -o parspmvt csb_spmvt_test.cpp + $(CXX) $(CFLAGS) $(DEFINES) -o parspmvt csb_spmvt_test.cpp both_d: both_test.cpp bicsb.cpp bicsb.h utility.h friends.h - icpc $(INCADD) $(INTELOPT) -o both_d both_test.cpp + $(CXX) $(CFLAGS) $(DEFINES) -o both_d both_test.cpp both_s: both_test.cpp bicsb.cpp bicsb.h utility.h friends.h - icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -o both_s both_test.cpp + $(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -o both_s both_test.cpp spmm_dall: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h for number in 4 8 12 16 24 32 40 48 56 64; do \ - echo "icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp"; \ - icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp; \ + echo "$(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp"; \ + $(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp; \ done; spmm_a: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h - icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -S -fcode-asm -vec_report6 spmm_test.cpp + $(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -S -fcode-asm -vec_report6 spmm_test.cpp spmm_sall: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h for number in 4 8 12 16 24 32 40 48 56 64; do \ - echo "icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp"; \ - icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp; \ + echo "$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp"; \ + $(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp; \ done; SSEspmv.o: SSEspmv.cpp - g++ -DAMD $(GCCOPT) -march=amdfam10 -c SSEspmv.cpp + $(CXX) $(CFLAGS) $(DEFINES) -c SSEspmv.cpp clean: rm -f seqspmv diff --git a/Semirings.h b/Semirings.h index 1deca9c..62563d1 100644 --- a/Semirings.h +++ b/Semirings.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "promote.h" template @@ -60,7 +61,7 @@ struct UnrollerL { template struct UnrollerL { template - static void step(Lambda& func) { + [[maybe_unused]] static void step([[maybe_unused]] Lambda& func) { // base case is when Begin=End; do nothing } }; @@ -75,13 +76,13 @@ struct PTSRArray // y <- a*x + y overload with a=1 static void axpy(const array & b, array & c) { + // const T2 * __restrict barr = std::assume_aligned(b.data()); + // T_promote * __restrict carr = std::assume_aligned(c.data()); const T2 * __restrict barr = b.data(); T_promote * __restrict carr = c.data(); - __assume_aligned(barr, ALIGN); - __assume_aligned(carr, ALIGN); #pragma simd - for(int i=0; i 32 static void axpy(T1 a, const array & b, array & c) { + // const T2 * __restrict barr = std::assume_aligned(b.data()); + // T_promote * __restrict carr = std::assume_aligned(c.data()); const T2 * __restrict barr = b.data(); T_promote * __restrict carr = c.data(); - __assume_aligned(barr, ALIGN); - __assume_aligned(carr, ALIGN); #pragma simd - for(int i=0; i #endif #include +#include #include #include using namespace std; @@ -71,7 +72,7 @@ class aligned_allocator // Returns true if and only if storage allocated from *this // can be deallocated from other, and vice versa. // Always returns true for stateless allocators. - bool operator==(const aligned_allocator& other) const + bool operator==([[maybe_unused]] const aligned_allocator& other) const { return true; } @@ -110,7 +111,7 @@ class aligned_allocator } // Mallocator wraps malloc(). - void * const pv = _mm_malloc(n * sizeof(T), Alignment); + void * const pv = std::aligned_alloc(Alignment, n * sizeof(T)); // Allocators should throw std::bad_alloc in the case of memory allocation failure. if (pv == NULL) @@ -121,9 +122,9 @@ class aligned_allocator return static_cast(pv); } - void deallocate(T * const p, const std::size_t n) const + void deallocate(T * const p, [[maybe_unused]] const std::size_t n) const { - _mm_free(p); + free(p); } diff --git a/bicsb.cpp b/bicsb.cpp index 974b50e..6542cd9 100644 --- a/bicsb.cpp +++ b/bicsb.cpp @@ -20,8 +20,8 @@ void BiCsb::Init(int workers, IT forcelogbeta) bool sizereq; if (ispar) { - sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers) - && (IntPower<2>(colbits) > SLACKNESS * workers)); + sizereq = ((IntPower<2>(rowbits) > (unsigned int) SLACKNESS * workers) + && (IntPower<2>(colbits) > (unsigned int) SLACKNESS * workers)); } else { @@ -43,7 +43,7 @@ void BiCsb::Init(int workers, IT forcelogbeta) colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit) if(ispar) { - while(IntPower<2>(rowhighbits) < SLACKNESS * workers) + while(IntPower<2>(rowhighbits) < (unsigned int) SLACKNESS * workers) { rowhighbits++; rowlowbits--; @@ -869,8 +869,8 @@ void BiCsb::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const RHS IT * __restrict r_bot = bot; NT * __restrict r_num = num; - __m128i lcms = _mm_set1_epi32 (lowcolmask); - __m128i lrms = _mm_set1_epi32 (lowrowmask); + [[maybe_unused]] __m128i lcms = _mm_set1_epi32 (lowcolmask); + [[maybe_unused]] __m128i lrms = _mm_set1_epi32 (lowrowmask); for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row { @@ -1350,8 +1350,9 @@ ofstream & BiCsb::PrintStats(ofstream & outfile) const outfile << "## Number of real blocks is "<< ntop << endl; outfile << "## Row imbalance is " << RowImbalance(*this) << endl; outfile << "## Col imbalance is " << ColImbalance(*this) << endl; + #ifdef STATS outfile << "## Block parallel calls is " << blockparcalls.get_value() << endl; - + #endif std::vector blocksizes(ntop); for(IT i=0; i mortoncmp; // comparison operator w.r.t. the (inverted N)-morton layout diff --git a/both_test.cpp b/both_test.cpp index 96de200..057dc00 100644 --- a/both_test.cpp +++ b/both_test.cpp @@ -31,7 +31,7 @@ using namespace std; int main(int argc, char* argv[]) { -#ifndef CILK_STUB +#if CILK==1 int gl_nworkers = __cilkrts_get_nworkers(); #else int gl_nworkers = 0; diff --git a/csb_spmv_test.cpp b/csb_spmv_test.cpp index d73792f..4677dd5 100644 --- a/csb_spmv_test.cpp +++ b/csb_spmv_test.cpp @@ -12,6 +12,11 @@ #include "cilk_util.h" #include "utility.h" +#ifndef RHSDIM + #define RHSDIM 16 +#endif +#define ALIGN 32 + #include "triple.h" #include "csc.h" #include "bicsb.h" @@ -32,7 +37,7 @@ using namespace std; int main(int argc, char* argv[]) { -#ifndef CILK_STUB +#if CILK==1 int gl_nworkers = __cilkrts_get_nworkers(); #else int gl_nworkers = 0; diff --git a/csb_spmvt_test.cpp b/csb_spmvt_test.cpp index 8cd4b36..fc9d348 100644 --- a/csb_spmvt_test.cpp +++ b/csb_spmvt_test.cpp @@ -30,7 +30,7 @@ INDEXTYPE flops; int main(int argc, char* argv[]) { -#ifndef CILK_STUB +#if CILK==1 int gl_nworkers = __cilkrts_get_nworkers(); #else int gl_nworkers = 0; diff --git a/csc.cpp b/csc.cpp index b2cd265..878f05b 100644 --- a/csc.cpp +++ b/csc.cpp @@ -98,7 +98,7 @@ Csc::~Csc() // (a) triples only contain the upper triangular part, or (b) the whole matrix template Csc::Csc(Triple * triples, ITYPE size, ITYPE rows, ITYPE cols, bool isSym) -:nz(size),m(rows),n(cols),issym(isSym) +:issym(isSym), nz(size),m(rows),n(cols) { // Constructing empty Csc objects (size = 0) are not allowed. assert(size != 0 && n != 0); @@ -174,7 +174,7 @@ Csc::Csc(Triple * triples, ITYPE size, ITYPE rows, ITYPE cols // Construct a Csc object from parallel arrays template Csc::Csc(ITYPE * ri, ITYPE * ci, T * val, ITYPE size, ITYPE rows, ITYPE cols, bool isSym) -:nz(size),m(rows),n(cols),issym(isSym) +:issym(isSym),nz(size),m(rows),n(cols) { // Constructing empty Csc objects (size = 0) are not allowed. assert(size != 0 && n != 0); diff --git a/csc.h b/csc.h index bce605d..5452afe 100644 --- a/csc.h +++ b/csc.h @@ -15,7 +15,7 @@ template class Csc { public: - Csc ():nz(0), m(0), n(0), logicalnz(0), issym(false) {} // default constructor + Csc (): issym(false), logicalnz(0), nz(0), m(0), n(0) {} // default constructor Csc (ITYPE size,ITYPE rows, ITYPE cols, bool isSym=false); Csc (const Csc & rhs); // copy constructor ~Csc(); diff --git a/friends.h b/friends.h index 4bdbe6a..1d99710 100644 --- a/friends.h +++ b/friends.h @@ -35,7 +35,7 @@ void bmcsb_gespmv (const BmCsb & A, const NT * __restrict x, NT * double t0 = timer_seconds_since_init(); unsigned * scansum = new unsigned[A.nrb]; - unsigned sum = prescan(scansum, A.masks, A.nrb); + [[maybe_unused]] unsigned sum = prescan(scansum, A.masks, A.nrb); double t1 = timer_seconds_since_init(); prescantime += (t1-t0); @@ -128,14 +128,15 @@ void bicsb_gespmv (const BiCsb & A, const RHS * __restrict x, LHS * __re IT thsh = BREAKEVEN * ysize; vector chunks; chunks.push_back(btop); - for(IT j =0; j < A.nbc; ) + for(IT j =0; j < A.nbc-1; ) { IT count = btop[j+1] - btop[j]; if(count < thsh && j < A.nbc) { - while(count < thsh && j < A.nbc) + while(count < thsh && j < A.nbc-1) { - count += btop[(++j)+1] - btop[j]; + j+=1; + count += btop[j+1] - btop[j]; } chunks.push_back(btop+j); // push, but exclude the block that caused the overflow } diff --git a/spmm_test.cpp b/spmm_test.cpp index 75f1a31..a4f847d 100644 --- a/spmm_test.cpp +++ b/spmm_test.cpp @@ -105,12 +105,12 @@ void VerifyMM (vector< array, ALLOC > & control, vector< array, int main(int argc, char* argv[]) { -#ifndef CILK_STUB +#if CILK==1 int gl_nworkers = __cilkrts_get_nworkers(); #else int gl_nworkers = 0; #endif - bool syminput = false; + [[maybe_unused]] bool syminput = false; bool binary = false; bool iscsc = false; INDEXTYPE m = 0, n = 0, nnz = 0, forcelogbeta = 0; diff --git a/spvec.cpp b/spvec.cpp index 17f3bc4..1b2519b 100644 --- a/spvec.cpp +++ b/spvec.cpp @@ -141,7 +141,7 @@ void Spvec::fillzero() } template -void Verify(Spvec & control, Spvec & test, string name, IT m) +void Verify(Spvec & control, Spvec & test, [[maybe_unused]] string name, IT m) { vectorerror(m); std::transform(&control[0], (&control[0])+m, &test[0], error.begin(), absdiff()); diff --git a/sym_spmv_test.cpp b/sym_spmv_test.cpp index 5e6dfb6..4428f06 100644 --- a/sym_spmv_test.cpp +++ b/sym_spmv_test.cpp @@ -10,7 +10,11 @@ #include "utility" #include "timer.gettimeofday.c" -#include "cilk_util.h" + +#ifndef RHSDIM + #define RHSDIM 16 +#endif +#define ALIGN 32 #include "triple.h" #include "csc.h" @@ -33,7 +37,7 @@ using namespace std; int main(int argc, char* argv[]) { -#ifndef CILK_STUB +#if CILK==1 int gl_nworkers = WORKERS; #else int gl_nworkers = 0; @@ -110,7 +114,7 @@ int main(int argc, char* argv[]) return 1; } - long tstart = cilk_get_time(); // start timer + long tstart = get_time(); // start timer cout << "Reading matrix with dimensions: "<< m << "-by-" << n <<" having "<< nnz << " nonzeros" << endl; INDEXTYPE * rowindices = new INDEXTYPE[nnz]; @@ -127,7 +131,7 @@ int main(int argc, char* argv[]) return -1; } - long tend = cilk_get_time(); // end timer + long tend = get_time(); // end timer cout<< "Reading matrix in binary took " << ((VALUETYPE) (tend-tstart)) /1000 << " seconds" <> m >> n >> nnz; // #{rows}-#{cols}-#{nonzeros} - long tstart = cilk_get_time(); // start timer + long tstart = get_time(); // start timer Triple * triples = new Triple[nnz]; if (infile.is_open()) @@ -167,7 +171,7 @@ int main(int argc, char* argv[]) } assert(cnz == nnz); } - long tend = cilk_get_time(); // end timer + long tend = get_time(); // end timer cout<< "Reading matrix in ascii took " << ((double) (tend-tstart)) /1000 << " seconds" < #include #define SYNCHED __cilkrts_synched() #define DETECT __cilkscreen_enable_checking() #define ENDDETECT __cilkscreen_disable_checking() #define WORKERS __cilkrts_get_nworkers() +#else +#define cilk_for for +#define cilk_sync +#define cilk_spawn +#define SYNCHED (true) +#define WORKERS (1) +#endif + #ifdef BWTEST #define UNROLL 100 @@ -29,7 +38,7 @@ using namespace std; #define UNROLL 1 #endif -#ifndef CILK_STUB +#if CILK==1 #ifdef __cplusplus extern "C" { #endif @@ -41,16 +50,15 @@ extern "C" { * full frame to determine this. */ -CILK_EXPORT __CILKRTS_NOTHROW -int __cilkrts_synched(void); +#define __cilkrts_synched() (0) #ifdef __cplusplus } // extern "C" #endif -#else /* CILK_STUB */ +#else /* CILK==1 */ /* Stubs for the api functions */ #define __cilkrts_synched() (1) -#endif /* CILK_STUB */ +#endif /* CILK */ #ifdef STATS #include @@ -96,7 +104,7 @@ const unsigned char masktable4[4] = { 0x08, 0x04, 0x02, 0x01 }; // mask for 2x2 template -MTYPE GetMaskTable(unsigned int index) +MTYPE GetMaskTable([[maybe_unused]] unsigned int index) { return 0; } @@ -503,5 +511,11 @@ inline unsigned int getDivident(unsigned int n, unsigned int d) return n; } +[[maybe_unused]] static long get_time() { + struct timeval st; + gettimeofday(&st, NULL); + return st.tv_sec * 1000000 + st.tv_usec; +} + #endif