PASSIONLab · wheatman · Aug 12, 2022
diff --git a/Makefile b/Makefile
@@ -1,55 +1,86 @@
-CILK = /opt/intel/composer_xe_2013.5.198/compiler
-INCADD = -I$(CILK)/include -I$(CILK)/examples/include
-LIBADD = -L$(CILK)/lib/intel64
+# Build configuration. Every knob below is set with ?= so it can be overridden
+# on the command line or from the environment, e.g. `make CILK=1 SANITIZE=1 OPT=0`.
+#   CILK     1 adds -fopencilk; the value is also passed to the sources as -DCILK=<value>
+#   NATIVE   1 adds -march=native
+#   OPT      optimization level, passed to the compiler as -O$(OPT)
+#   SANITIZE 1 adds -fsanitize=undefined,address (plus the cilk sanitizer when CILK=1)
+CILK?=0
+NATIVE?=1
+OPT?=3
+SANITIZE?=0
+
+# Flags used by the compile rules below.
+CFLAGS := -Wall -Wextra -O$(OPT) -g -std=c++20 -gdwarf-4 -fno-exceptions -Wno-unknown-pragmas -Wno-comment
+
+ifeq ($(NATIVE),1)
+CFLAGS += -march=native
+endif
+
+ifeq ($(CILK),1)
+CFLAGS += -fopencilk
+endif
+
+ifeq ($(SANITIZE),1)
+ifeq ($(CILK),1)
+CFLAGS += -fsanitize=cilk,undefined,address -fno-omit-frame-pointer
+else
+CFLAGS += -fsanitize=undefined,address -fno-omit-frame-pointer
+endif
+endif
+
+# The test drivers check this macro with `#if CILK==1`.
+DEFINES := -DCILK=$(CILK)
+
+# `all` and `clean` name commands, not files; declaring them phony keeps them
+# working even if a file with one of those names appears in the directory.
+.PHONY: all clean
+all: parspmv both_d spmm_dall spmm_a spmm_sall
 
-GCCOPT = -O2 -fno-rtti -fno-exceptions # -ftree-vectorize
-INTELOPT = -O2 -no-ipo -fno-rtti -fno-exceptions -parallel -restrict -std=c++11 -xAVX -no-prec-div #-fno-inline-functions
-DEB = -g -DNOBM -O0 -parallel -restrict -std=c++11 
 
 seqsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
-	icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqsym sym_spmv_test.cpp SSEspmv.o
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $< SSEspmv.o
 
 parsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
-	icpc $(INCADD) $(DEB) -o parsym sym_spmv_test.cpp SSEspmv.o 
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $< SSEspmv.o
 
 symanal: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
-	icpc -DSTATS $(INCADD) $(INTELOPT) -o symanal sym_spmv_test.cpp SSEspmv.o -lcilkutil
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $< SSEspmv.o
 
 seqspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o
-	icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqspmv csb_spmv_test.cpp SSEspmv.o
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $< SSEspmv.o
 
 parspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o 
-	icpc $(INCADD) $(INTELOPT) -o parspmv csb_spmv_test.cpp SSEspmv.o
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $< SSEspmv.o
 
 parspmv_nobm: csb_spmv_test.cpp bicsb.cpp bicsb.h friends.h utility.h
-	icpc $(INCADD) $(INTELOPT) -DNOBM -o parspmv_nobm csb_spmv_test.cpp
+	$(CXX) $(CFLAGS) $(DEFINES) -DNOBM -o $@ $<
 
 parspmvt: csb_spmvt_test.cpp bicsb.cpp bicsb.h utility.h friends.h
-	icpc $(INCADD) $(INTELOPT) -o parspmvt csb_spmvt_test.cpp
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $<
 
 both_d:	both_test.cpp bicsb.cpp bicsb.h utility.h friends.h
-	icpc $(INCADD) $(INTELOPT) -o both_d both_test.cpp
+	$(CXX) $(CFLAGS) $(DEFINES) -o $@ $<
 
 both_s:	both_test.cpp bicsb.cpp bicsb.h utility.h friends.h
-	icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -o both_s both_test.cpp
+	$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -o $@ $<
 
 spmm_dall:	spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
 	for number in 4 8 12 16 24 32 40 48 56 64; do \
-		echo "icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp"; \
-		icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp; \
+		echo "$(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number $<"; \
+		$(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number $< || exit 1; \
 	done;
 
 spmm_a:	spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
-	icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -S -fcode-asm -vec_report6 spmm_test.cpp
+	$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -S -fverbose-asm $<
 
 spmm_sall:	spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
 	for number in 4 8 12 16 24 32 40 48 56 64; do \
-		echo "icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp"; \
-		icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp; \
+		echo "$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number $<"; \
+		$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number $< || exit 1; \
 	done;
 
 SSEspmv.o: SSEspmv.cpp
-	g++ -DAMD $(GCCOPT) -march=amdfam10 -c SSEspmv.cpp	
+	$(CXX) $(CFLAGS) $(DEFINES) -c $< -o $@
 
 clean:	
 	rm -f seqspmv

diff --git a/Semirings.h b/Semirings.h
@@ -6,6 +6,7 @@
 #include <climits>
 #include <cmath>
 #include <tr1/array>
+#include <memory>
 #include "promote.h"
 
 template <typename T>
@@ -60,7 +61,7 @@ struct UnrollerL {
 template<int End, int Step>
 struct UnrollerL<End, End, Step> {
     template<typename Lambda>
-    static void step(Lambda& func) {
+    static void step(Lambda& /*func*/) {
 		// base case is when Begin=End; do nothing
     }
 };
@@ -75,13 +76,13 @@ struct PTSRArray
 	// y <- a*x + y overload with a=1
 	static void axpy(const array<T2, D> & b, array<T_promote, D> & c)
 	{
+		// const T2 * __restrict barr =  std::assume_aligned<ALIGN>(b.data());
+		// T_promote * __restrict carr = std::assume_aligned<ALIGN>(c.data());
 		const T2 * __restrict barr =  b.data();
 		T_promote * __restrict carr = c.data();
-		__assume_aligned(barr, ALIGN);
-		__assume_aligned(carr, ALIGN);
 
 		#pragma simd
-		for(int i=0; i<D; ++i)
+		for(unsigned int i=0; i<D; ++i)
 		{
 			carr[i] +=  barr[i];
 		}
@@ -92,13 +93,13 @@ struct PTSRArray
 	// Todo: Do partial unrolling; this code will bloat for D > 32 
 	static void axpy(T1 a, const array<T2,D> & b, array<T_promote,D> & c)
 	{
+		// const T2 * __restrict barr =  std::assume_aligned<ALIGN>(b.data());
+		// T_promote * __restrict carr = std::assume_aligned<ALIGN>(c.data());
 		const T2 * __restrict barr =  b.data();
 		T_promote * __restrict carr = c.data();
-		__assume_aligned(barr, ALIGN);
-		__assume_aligned(carr, ALIGN);
 
 		#pragma simd
-		for(int i=0; i<D; ++i)
+		for(unsigned int i=0; i<D; ++i)
 		{
 			carr[i] +=  a* barr[i];
 		}	

diff --git a/aligned.h b/aligned.h
@@ -2,6 +2,7 @@
 #include <malloc.h>
 #endif
 #include <cstdint>
+#include <cstdlib>
 #include <vector>
 #include <iostream>
 using namespace std;
@@ -71,7 +72,7 @@ class aligned_allocator
 		// Returns true if and only if storage allocated from *this
 		// can be deallocated from other, and vice versa.
 		// Always returns true for stateless allocators.
-		bool operator==(const aligned_allocator& other) const
+		bool operator==(const aligned_allocator& /*other*/) const
 		{
 			return true;
 		}
@@ -110,7 +111,7 @@ class aligned_allocator
 			}
 
 			// Mallocator wraps malloc().
-			void * const pv = _mm_malloc(n * sizeof(T), Alignment);
+			void * const pv = std::aligned_alloc(Alignment, (n * sizeof(T) + Alignment - 1) / Alignment * Alignment);	// aligned_alloc requires a size that is a multiple of Alignment, so round up
 
 			// Allocators should throw std::bad_alloc in the case of memory allocation failure.
 			if (pv == NULL)
@@ -121,9 +122,9 @@ class aligned_allocator
 			return static_cast<T *>(pv);
 		}
 
-		void deallocate(T * const p, const std::size_t n) const
+		void deallocate(T * const p, const std::size_t /*n*/) const
 		{
-			_mm_free(p);
+			std::free(p);	// pairs with std::aligned_alloc in allocate()
 		}
 
 

diff --git a/bicsb.cpp b/bicsb.cpp
@@ -20,8 +20,8 @@ void BiCsb<NT, IT>::Init(int workers, IT forcelogbeta)
 	bool sizereq;
 	if (ispar)
 	{
-		sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers) 
-			&& (IntPower<2>(colbits) > SLACKNESS * workers));
+		sizereq = ((IntPower<2>(rowbits) > (unsigned int) SLACKNESS * workers) 
+			&& (IntPower<2>(colbits) > (unsigned int) SLACKNESS * workers));
 	}
 	else
 	{
@@ -43,7 +43,7 @@ void BiCsb<NT, IT>::Init(int workers, IT forcelogbeta)
 	colhighbits = colbits-collowbits;	// # higher order bits for cols (has at least one bit)
 	if(ispar)
 	{
-		while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
+		while(IntPower<2>(rowhighbits) < (unsigned int) SLACKNESS * workers)
 		{
 			rowhighbits++;
 			rowlowbits--;
@@ -869,8 +869,8 @@ void BiCsb<NT, IT>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const RHS
 	IT * __restrict r_bot = bot;
 	NT * __restrict r_num = num;
 
-	__m128i lcms = _mm_set1_epi32 (lowcolmask);
-	__m128i lrms = _mm_set1_epi32 (lowrowmask);
+	[[maybe_unused]] __m128i lcms = _mm_set1_epi32 (lowcolmask);
+	[[maybe_unused]] __m128i lrms = _mm_set1_epi32 (lowrowmask);
 
 	for (IT j = bstart ; j < bend ; ++j)		// for all blocks inside that block row
 	{
@@ -1350,8 +1350,9 @@ ofstream & BiCsb<NT, IT>::PrintStats(ofstream & outfile) const
 	outfile << "## Number of real blocks is "<< ntop << endl;
 	outfile << "## Row imbalance is " << RowImbalance(*this) << endl;
 	outfile << "## Col imbalance is " << ColImbalance(*this) << endl;
+	#ifdef STATS
 	outfile << "## Block parallel calls is " << blockparcalls.get_value() << endl;
-
+	#endif
 	std::vector<int> blocksizes(ntop);
 	for(IT i=0; i<nbr; ++i)
 	{

diff --git a/bmcsb.h b/bmcsb.h
@@ -31,6 +31,7 @@ class BmCsb
 	ofstream & PrintStats(ofstream & outfile) const;
 	IT colsize() const { return n;} 
 	IT rowsize() const { return m;} 
+	IT numnonzeros() const { return nz; }
 	IT numregb() const { return nrb;}
 	bool isPar() const { return ispar; }
 
@@ -66,12 +67,12 @@ class BmCsb
 
 	IT rowlowbits;	// # lower order bits for rows
 	IT rowhighbits;
-	IT highrowmask; // mask with the first log(m)/2 bits = 1 and the other bits = 0  
+	IT highrowmask; // mask with the first log(m)/2 bits = 1 and the other bits = 0  
 	IT lowrowmask;
 
 	IT collowbits;	// # lower order bits for columns
 	IT colhighbits;
-	IT highcolmask; // mask with the first log(n)/2 bits = 1 and the other bits = 0  
+	IT highcolmask; // mask with the first log(n)/2 bits = 1 and the other bits = 0  
 	IT lowcolmask;
 
 	MortonCompare<IT> mortoncmp;	// comparison operator w.r.t. the (inverted N)-morton layout

diff --git a/both_test.cpp b/both_test.cpp
@@ -31,7 +31,7 @@ using namespace std;
 
 int main(int argc, char* argv[])
 {
-#ifndef CILK_STUB
+#if CILK==1
 	int gl_nworkers = __cilkrts_get_nworkers();
 #else
 	int gl_nworkers = 0;

diff --git a/csb_spmv_test.cpp b/csb_spmv_test.cpp
@@ -12,6 +12,11 @@
 #include "cilk_util.h"
 #include "utility.h"
 
+#ifndef RHSDIM
+	#define RHSDIM 16
+#endif
+#define ALIGN 32
+
 #include "triple.h"
 #include "csc.h"
 #include "bicsb.h"
@@ -32,7 +37,7 @@ using namespace std;
 
 int main(int argc, char* argv[])
 {
-#ifndef CILK_STUB
+#if CILK==1
 	int gl_nworkers = __cilkrts_get_nworkers();
 #else
 	int gl_nworkers = 0;

diff --git a/csb_spmvt_test.cpp b/csb_spmvt_test.cpp
@@ -30,7 +30,7 @@ INDEXTYPE flops;
 
 int main(int argc, char* argv[])
 {
-#ifndef	CILK_STUB
+#if CILK==1
 	int gl_nworkers = __cilkrts_get_nworkers();
 #else
 	int gl_nworkers = 0;

diff --git a/csc.cpp b/csc.cpp
@@ -98,7 +98,7 @@ Csc<T,ITYPE>::~Csc()
 // (a) triples only contain the upper triangular part, or (b) the whole matrix
 template <class T, class ITYPE>
 Csc<T,ITYPE>::Csc(Triple<T, ITYPE> * triples, ITYPE size, ITYPE rows, ITYPE cols, bool isSym)
-:nz(size),m(rows),n(cols),issym(isSym)
+:issym(isSym), nz(size),m(rows),n(cols)
 {
 	// Constructing empty Csc objects (size = 0) are not allowed.
 	assert(size != 0 && n != 0);
@@ -174,7 +174,7 @@ Csc<T,ITYPE>::Csc(Triple<T, ITYPE> * triples, ITYPE size, ITYPE rows, ITYPE cols
 // Construct a Csc object from parallel arrays
 template <class T, class ITYPE>
 Csc<T,ITYPE>::Csc(ITYPE * ri, ITYPE * ci, T * val, ITYPE size, ITYPE rows, ITYPE cols, bool isSym)
-:nz(size),m(rows),n(cols),issym(isSym)
+:issym(isSym),nz(size),m(rows),n(cols)
 {
 	// Constructing empty Csc objects (size = 0) are not allowed.
 	assert(size != 0 && n != 0);

diff --git a/csc.h b/csc.h
@@ -15,7 +15,7 @@ template <class T, class ITYPE>
 class Csc
 {
 public:
-	Csc ():nz(0), m(0), n(0), logicalnz(0), issym(false) {}				// default constructor
+	Csc (): issym(false), logicalnz(0), nz(0), m(0), n(0) {}				// default constructor
 	Csc (ITYPE size,ITYPE rows, ITYPE cols, bool isSym=false);
 	Csc (const Csc<T, ITYPE> & rhs);		// copy constructor
 	~Csc();

diff --git a/friends.h b/friends.h
@@ -35,7 +35,7 @@ void bmcsb_gespmv (const BmCsb<NT, IT, TTDIM> & A, const NT * __restrict x, NT *
 	double t0 = timer_seconds_since_init();
 
 	unsigned * scansum = new unsigned[A.nrb];
-	unsigned sum = prescan(scansum, A.masks, A.nrb);
+	[[maybe_unused]] unsigned sum = prescan(scansum, A.masks, A.nrb);
 
 	double t1 = timer_seconds_since_init();
 	prescantime += (t1-t0);
@@ -128,14 +128,15 @@ void bicsb_gespmv (const BiCsb<NT, IT> & A, const RHS * __restrict x, LHS * __re
 				IT thsh = BREAKEVEN * ysize;
 				vector<IT*> chunks;
 				chunks.push_back(btop);
-				for(IT j =0; j < A.nbc; )
+				for(IT j =0; j < A.nbc-1; )
 				{
 					IT count = btop[j+1] - btop[j];
 					if(count < thsh && j < A.nbc)
 					{
-						while(count < thsh && j < A.nbc)
+						while(count < thsh && j < A.nbc-1)
 						{
-							count += btop[(++j)+1] - btop[j]; 
+							j+=1;
+							count += btop[j+1] - btop[j]; 
 						}
 						chunks.push_back(btop+j);	// push, but exclude the block that caused the overflow
 					}

diff --git a/spmm_test.cpp b/spmm_test.cpp
@@ -105,12 +105,12 @@ void VerifyMM (vector< array<NT,DIM>, ALLOC > & control, vector< array<NT,DIM>,
 
 int main(int argc, char* argv[])
 {
-#ifndef CILK_STUB
+#if CILK==1
 	int gl_nworkers = __cilkrts_get_nworkers();
 #else
 	int gl_nworkers = 0;
 #endif
-	bool syminput = false;
+	[[maybe_unused]] bool syminput = false;
 	bool binary = false;
 	bool iscsc = false;
 	INDEXTYPE m = 0, n = 0, nnz = 0, forcelogbeta = 0;

diff --git a/spvec.cpp b/spvec.cpp
@@ -141,7 +141,7 @@ void Spvec<T,ITYPE>::fillzero()
 }
 
 template <typename NT, typename IT>
-void Verify(Spvec<NT, IT> & control, Spvec<NT, IT> & test, string name, IT m)
+void Verify(Spvec<NT, IT> & control, Spvec<NT, IT> & test, [[maybe_unused]] string name, IT m)
 {
     vector<NT>error(m);
     std::transform(&control[0], (&control[0])+m, &test[0], error.begin(), absdiff<NT>());