Skip to content

NOT FOR MERGE: getting it to compile with OpenCilk #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 41 additions & 21 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,55 +1,75 @@
CILK = /opt/intel/composer_xe_2013.5.198/compiler
INCADD = -I$(CILK)/include -I$(CILK)/examples/include
LIBADD = -L$(CILK)/lib/intel64
CILK?=0
NATIVE?=1
OPT?=3
SANITIZE?=0

CFLAGS := -Wall -Wextra -O$(OPT) -g -std=c++20 -gdwarf-4 -fno-exceptions -Wno-unknown-pragmas -Wno-comment

ifeq ($(NATIVE),1)
CFLAGS += -march=native
endif

ifeq ($(CILK),1)
CFLAGS += -fopencilk
endif

ifeq ($(SANITIZE),1)
ifeq ($(CILK),1)
CFLAGS += -fsanitize=cilk,undefined,address -fno-omit-frame-pointer
else
CFLAGS += -fsanitize=undefined,address -fno-omit-frame-pointer
endif
endif

DEFINES := -DCILK=$(CILK)

all: parspmv both_d spmm_dall spmm_a spmm_sall

GCCOPT = -O2 -fno-rtti -fno-exceptions # -ftree-vectorize
INTELOPT = -O2 -no-ipo -fno-rtti -fno-exceptions -parallel -restrict -std=c++11 -xAVX -no-prec-div #-fno-inline-functions
DEB = -g -DNOBM -O0 -parallel -restrict -std=c++11

seqsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqsym sym_spmv_test.cpp SSEspmv.o
$(CXX) $(CFLAGS) $(DEFINES) -o seqsym sym_spmv_test.cpp SSEspmv.o

parsym: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
icpc $(INCADD) $(DEB) -o parsym sym_spmv_test.cpp SSEspmv.o
$(CXX) $(CFLAGS) $(DEFINES) -o parsym sym_spmv_test.cpp SSEspmv.o

symanal: sym_spmv_test.cpp csbsym.cpp csbsym.h utility.h friends.h SSEspmv.o
icpc -DSTATS $(INCADD) $(INTELOPT) -o symanal sym_spmv_test.cpp SSEspmv.o -lcilkutil
$(CXX) $(CFLAGS) $(DEFINES) -o symanal sym_spmv_test.cpp SSEspmv.o

seqspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o
icpc -cilk-serialize $(INCADD) $(INTELOPT) -o seqspmv csb_spmv_test.cpp SSEspmv.o
$(CXX) $(CFLAGS) $(DEFINES) -o seqspmv csb_spmv_test.cpp SSEspmv.o

parspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h bmcsb.cpp bmcsb.h friends.h utility.h SSEspmv.o
icpc $(INCADD) $(INTELOPT) -o parspmv csb_spmv_test.cpp SSEspmv.o
$(CXX) $(CFLAGS) $(DEFINES) -o parspmv csb_spmv_test.cpp SSEspmv.o

parspmv_nobm: csb_spmv_test.cpp bicsb.cpp bicsb.h friends.h utility.h
icpc $(INCADD) $(INTELOPT) -DNOBM -o parspmv_nobm csb_spmv_test.cpp
$(CXX) $(CFLAGS) $(DEFINES) -DNOBM -o parspmv_nobm csb_spmv_test.cpp

parspmvt: csb_spmvt_test.cpp bicsb.cpp bicsb.h utility.h friends.h
icpc $(INCADD) $(INTELOPT) -o parspmvt csb_spmvt_test.cpp
$(CXX) $(CFLAGS) $(DEFINES) -o parspmvt csb_spmvt_test.cpp

both_d: both_test.cpp bicsb.cpp bicsb.h utility.h friends.h
icpc $(INCADD) $(INTELOPT) -o both_d both_test.cpp
$(CXX) $(CFLAGS) $(DEFINES) -o both_d both_test.cpp

both_s: both_test.cpp bicsb.cpp bicsb.h utility.h friends.h
icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -o both_s both_test.cpp
$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -o both_s both_test.cpp

spmm_dall: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
for number in 4 8 12 16 24 32 40 48 56 64; do \
echo "icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp"; \
icpc $(INCADD) $(INTELOPT) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp; \
echo "$(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp"; \
$(CXX) $(CFLAGS) $(DEFINES) -DRHSDIM=$$number -o spmm_d$$number spmm_test.cpp; \
done;

spmm_a: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -S -fcode-asm -vec_report6 spmm_test.cpp
$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -S -fcode-asm -vec_report6 spmm_test.cpp

spmm_sall: spmm_test.cpp bicsb.cpp bicsb.h utility.h friends.h
for number in 4 8 12 16 24 32 40 48 56 64; do \
echo "icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp"; \
icpc $(INCADD) $(INTELOPT) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp; \
echo "$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp"; \
$(CXX) $(CFLAGS) $(DEFINES) -DSINGLEPRECISION -DRHSDIM=$$number -o spmm_s$$number spmm_test.cpp; \
done;

SSEspmv.o: SSEspmv.cpp
g++ -DAMD $(GCCOPT) -march=amdfam10 -c SSEspmv.cpp
$(CXX) $(CFLAGS) $(DEFINES) -c SSEspmv.cpp

clean:
rm -f seqspmv
Expand Down
15 changes: 8 additions & 7 deletions Semirings.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <climits>
#include <cmath>
#include <tr1/array>
#include <memory>
#include "promote.h"

template <typename T>
Expand Down Expand Up @@ -60,7 +61,7 @@ struct UnrollerL {
template<int End, int Step>
struct UnrollerL<End, End, Step> {
template<typename Lambda>
static void step(Lambda& func) {
[[maybe_unused]] static void step([[maybe_unused]] Lambda& func) {
// base case is when Begin=End; do nothing
}
};
Expand All @@ -75,13 +76,13 @@ struct PTSRArray
// y <- a*x + y overload with a=1
static void axpy(const array<T2, D> & b, array<T_promote, D> & c)
{
// const T2 * __restrict barr = std::assume_aligned<ALIGN>(b.data());
// T_promote * __restrict carr = std::assume_aligned<ALIGN>(c.data());
const T2 * __restrict barr = b.data();
T_promote * __restrict carr = c.data();
__assume_aligned(barr, ALIGN);
__assume_aligned(carr, ALIGN);

#pragma simd
for(int i=0; i<D; ++i)
for(unsigned int i=0; i<D; ++i)
{
carr[i] += barr[i];
}
Expand All @@ -92,13 +93,13 @@ struct PTSRArray
// Todo: Do partial unrolling; this code will bloat for D > 32
static void axpy(T1 a, const array<T2,D> & b, array<T_promote,D> & c)
{
// const T2 * __restrict barr = std::assume_aligned<ALIGN>(b.data());
// T_promote * __restrict carr = std::assume_aligned<ALIGN>(c.data());
const T2 * __restrict barr = b.data();
T_promote * __restrict carr = c.data();
__assume_aligned(barr, ALIGN);
__assume_aligned(carr, ALIGN);

#pragma simd
for(int i=0; i<D; ++i)
for(unsigned int i=0; i<D; ++i)
{
carr[i] += a* barr[i];
}
Expand Down
9 changes: 5 additions & 4 deletions aligned.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <malloc.h>
#endif
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <iostream>
using namespace std;
Expand Down Expand Up @@ -71,7 +72,7 @@ class aligned_allocator
// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
bool operator==(const aligned_allocator& other) const
bool operator==([[maybe_unused]] const aligned_allocator& other) const
{
return true;
}
Expand Down Expand Up @@ -110,7 +111,7 @@ class aligned_allocator
}

// Mallocator wraps malloc().
void * const pv = _mm_malloc(n * sizeof(T), Alignment);
void * const pv = std::aligned_alloc(Alignment, n * sizeof(T));

// Allocators should throw std::bad_alloc in the case of memory allocation failure.
if (pv == NULL)
Expand All @@ -121,9 +122,9 @@ class aligned_allocator
return static_cast<T *>(pv);
}

void deallocate(T * const p, const std::size_t n) const
void deallocate(T * const p, [[maybe_unused]] const std::size_t n) const
{
_mm_free(p);
free(p);
}


Expand Down
13 changes: 7 additions & 6 deletions bicsb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ void BiCsb<NT, IT>::Init(int workers, IT forcelogbeta)
bool sizereq;
if (ispar)
{
sizereq = ((IntPower<2>(rowbits) > SLACKNESS * workers)
&& (IntPower<2>(colbits) > SLACKNESS * workers));
sizereq = ((IntPower<2>(rowbits) > (unsigned int) SLACKNESS * workers)
&& (IntPower<2>(colbits) > (unsigned int) SLACKNESS * workers));
}
else
{
Expand All @@ -43,7 +43,7 @@ void BiCsb<NT, IT>::Init(int workers, IT forcelogbeta)
colhighbits = colbits-collowbits; // # higher order bits for cols (has at least one bit)
if(ispar)
{
while(IntPower<2>(rowhighbits) < SLACKNESS * workers)
while(IntPower<2>(rowhighbits) < (unsigned int) SLACKNESS * workers)
{
rowhighbits++;
rowlowbits--;
Expand Down Expand Up @@ -869,8 +869,8 @@ void BiCsb<NT, IT>::SubSpMV(IT * __restrict btop, IT bstart, IT bend, const RHS
IT * __restrict r_bot = bot;
NT * __restrict r_num = num;

__m128i lcms = _mm_set1_epi32 (lowcolmask);
__m128i lrms = _mm_set1_epi32 (lowrowmask);
[[maybe_unused]] __m128i lcms = _mm_set1_epi32 (lowcolmask);
[[maybe_unused]] __m128i lrms = _mm_set1_epi32 (lowrowmask);

for (IT j = bstart ; j < bend ; ++j) // for all blocks inside that block row
{
Expand Down Expand Up @@ -1350,8 +1350,9 @@ ofstream & BiCsb<NT, IT>::PrintStats(ofstream & outfile) const
outfile << "## Number of real blocks is "<< ntop << endl;
outfile << "## Row imbalance is " << RowImbalance(*this) << endl;
outfile << "## Col imbalance is " << ColImbalance(*this) << endl;
#ifdef STATS
outfile << "## Block parallel calls is " << blockparcalls.get_value() << endl;

#endif
std::vector<int> blocksizes(ntop);
for(IT i=0; i<nbr; ++i)
{
Expand Down
5 changes: 3 additions & 2 deletions bmcsb.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class BmCsb
ofstream & PrintStats(ofstream & outfile) const;
IT colsize() const { return n;}
IT rowsize() const { return m;}
IT numnonzeros() const { return nz; }
IT numregb() const { return nrb;}
bool isPar() const { return ispar; }

Expand Down Expand Up @@ -66,12 +67,12 @@ class BmCsb

IT rowlowbits; // # lower order bits for rows
IT rowhighbits;
IT highrowmask; // mask with the first log(m)/2 bits = 1 and the other bits = 0
IT highrowmask; // mask with the first log(m)/2 bits = 1 and the other bits = 0
IT lowrowmask;

IT collowbits; // # lower order bits for columns
IT colhighbits;
IT highcolmask; // mask with the first log(n)/2 bits = 1 and the other bits = 0
IT highcolmask; // mask with the first log(n)/2 bits = 1 and the other bits = 0
IT lowcolmask;

MortonCompare<IT> mortoncmp; // comparison operator w.r.t. the (inverted N)-morton layout
Expand Down
2 changes: 1 addition & 1 deletion both_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ using namespace std;

int main(int argc, char* argv[])
{
#ifndef CILK_STUB
#if CILK==1
int gl_nworkers = __cilkrts_get_nworkers();
#else
int gl_nworkers = 0;
Expand Down
7 changes: 6 additions & 1 deletion csb_spmv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
#include "cilk_util.h"
#include "utility.h"

#ifndef RHSDIM
#define RHSDIM 16
#endif
#define ALIGN 32

#include "triple.h"
#include "csc.h"
#include "bicsb.h"
Expand All @@ -32,7 +37,7 @@ using namespace std;

int main(int argc, char* argv[])
{
#ifndef CILK_STUB
#if CILK==1
int gl_nworkers = __cilkrts_get_nworkers();
#else
int gl_nworkers = 0;
Expand Down
2 changes: 1 addition & 1 deletion csb_spmvt_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ INDEXTYPE flops;

int main(int argc, char* argv[])
{
#ifndef CILK_STUB
#if CILK==1
int gl_nworkers = __cilkrts_get_nworkers();
#else
int gl_nworkers = 0;
Expand Down
4 changes: 2 additions & 2 deletions csc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ Csc<T,ITYPE>::~Csc()
// (a) triples only contain the upper triangular part, or (b) the whole matrix
template <class T, class ITYPE>
Csc<T,ITYPE>::Csc(Triple<T, ITYPE> * triples, ITYPE size, ITYPE rows, ITYPE cols, bool isSym)
:nz(size),m(rows),n(cols),issym(isSym)
:issym(isSym), nz(size),m(rows),n(cols)
{
// Constructing empty Csc objects (size = 0) are not allowed.
assert(size != 0 && n != 0);
Expand Down Expand Up @@ -174,7 +174,7 @@ Csc<T,ITYPE>::Csc(Triple<T, ITYPE> * triples, ITYPE size, ITYPE rows, ITYPE cols
// Construct a Csc object from parallel arrays
template <class T, class ITYPE>
Csc<T,ITYPE>::Csc(ITYPE * ri, ITYPE * ci, T * val, ITYPE size, ITYPE rows, ITYPE cols, bool isSym)
:nz(size),m(rows),n(cols),issym(isSym)
:issym(isSym),nz(size),m(rows),n(cols)
{
// Constructing empty Csc objects (size = 0) are not allowed.
assert(size != 0 && n != 0);
Expand Down
2 changes: 1 addition & 1 deletion csc.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ template <class T, class ITYPE>
class Csc
{
public:
Csc ():nz(0), m(0), n(0), logicalnz(0), issym(false) {} // default constructor
Csc (): issym(false), logicalnz(0), nz(0), m(0), n(0) {} // default constructor
Csc (ITYPE size,ITYPE rows, ITYPE cols, bool isSym=false);
Csc (const Csc<T, ITYPE> & rhs); // copy constructor
~Csc();
Expand Down
9 changes: 5 additions & 4 deletions friends.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ void bmcsb_gespmv (const BmCsb<NT, IT, TTDIM> & A, const NT * __restrict x, NT *
double t0 = timer_seconds_since_init();

unsigned * scansum = new unsigned[A.nrb];
unsigned sum = prescan(scansum, A.masks, A.nrb);
[[maybe_unused]] unsigned sum = prescan(scansum, A.masks, A.nrb);

double t1 = timer_seconds_since_init();
prescantime += (t1-t0);
Expand Down Expand Up @@ -128,14 +128,15 @@ void bicsb_gespmv (const BiCsb<NT, IT> & A, const RHS * __restrict x, LHS * __re
IT thsh = BREAKEVEN * ysize;
vector<IT*> chunks;
chunks.push_back(btop);
for(IT j =0; j < A.nbc; )
for(IT j =0; j < A.nbc-1; )
{
IT count = btop[j+1] - btop[j];
if(count < thsh && j < A.nbc)
{
while(count < thsh && j < A.nbc)
while(count < thsh && j < A.nbc-1)
{
count += btop[(++j)+1] - btop[j];
j+=1;
count += btop[j+1] - btop[j];
}
chunks.push_back(btop+j); // push, but exclude the block that caused the overflow
}
Expand Down
4 changes: 2 additions & 2 deletions spmm_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,12 @@ void VerifyMM (vector< array<NT,DIM>, ALLOC > & control, vector< array<NT,DIM>,

int main(int argc, char* argv[])
{
#ifndef CILK_STUB
#if CILK==1
int gl_nworkers = __cilkrts_get_nworkers();
#else
int gl_nworkers = 0;
#endif
bool syminput = false;
[[maybe_unused]] bool syminput = false;
bool binary = false;
bool iscsc = false;
INDEXTYPE m = 0, n = 0, nnz = 0, forcelogbeta = 0;
Expand Down
2 changes: 1 addition & 1 deletion spvec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ void Spvec<T,ITYPE>::fillzero()
}

template <typename NT, typename IT>
void Verify(Spvec<NT, IT> & control, Spvec<NT, IT> & test, string name, IT m)
void Verify(Spvec<NT, IT> & control, Spvec<NT, IT> & test, [[maybe_unused]] string name, IT m)
{
vector<NT>error(m);
std::transform(&control[0], (&control[0])+m, &test[0], error.begin(), absdiff<NT>());
Expand Down
Loading