
Commit 2273286: "first commit" (0 parents)

29 files changed: +3769 -0 lines

MainPage.h (+71 lines)

/** @mainpage Compressed Sparse Blocks (CSB) Library (Cilk Plus implementation)
 *
 * @author <a href="http://gauss.cs.ucsb.edu/~aydin"> Aydın Buluç </a>
 * (in collaboration with <a href="http://crd.lbl.gov/about/staff/amsc/scientific-computing-group-scg/hasan-metin-aktulga/">Hasan Metin Aktulga</a>, <a href="http://www.cs.berkeley.edu/~demmel/">James Demmel</a>, <a href="http://www.cs.georgetown.edu/~jfineman/">Jeremy Fineman</a>, <a href="http://www.fftw.org/~athena/">Matteo Frigo</a>, <a href="http://www.cs.ucsb.edu/~gilbert/">John Gilbert</a>, <a href="http://people.csail.mit.edu/cel/">Charles Leiserson</a>, <a href="http://crd.lbl.gov/about/staff/cds/ftg/leonid-oliker/">Lenny Oliker</a>, <a href="http://crd.lbl.gov/about/staff/cds/ftg/samuel-williams/">Sam Williams</a>).
 *
 * <i> This material is based upon work supported by the National Science Foundation under Grants No. 0540248, 0615215, 0712243, 0822896, and 0709385, by MIT Lincoln Laboratory under contract 7000012980, and by the Department of Energy, Office of Science, ASCR Contract No. DE-AC05-00OR22725. Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation (NSF) and the Department of Energy (DOE). This software is released under <a href="http://en.wikipedia.org/wiki/MIT_License">the MIT license</a>.</i>
 *
 * @section intro Introduction
 * Compressed Sparse Blocks (CSB) is a storage format for sparse matrices that favors neither rows nor columns, hence offering performance symmetry on shared-memory parallel systems for Ax and A'x. The format was originally described in
 * <a href="http://gauss.cs.ucsb.edu/~aydin/csb2009.pdf">this paper</a> [1]. It was later improved through the incorporation of bitmasked register blocks in <a href="http://gauss.cs.ucsb.edu/~aydin/ipdps2011.pdf">this paper</a> [2], which also proposes an algorithm for symmetric matrices. Finally, <a href="http://gauss.cs.ucsb.edu/~aydin/ipdps14aktulga.pdf">this recent paper</a> [3] includes performance results for the multiple-vector cases.
 *
 * This library targets shared-memory parallel systems (ideally a single NUMA domain for best performance) and implements:
 * - Sparse Matrix-Vector Multiplication (SpMV)
 * - Sparse Matrix-Transpose-Vector Multiplication (SpMV_T)
 * - Sparse Matrix-Multiple-Vector Multiplication (SpMM)
 * - Sparse Matrix-Transpose-Multiple-Vector Multiplication (SpMM_T)
 *
 * Download the <a href="csb2014.tgz">library and drivers as a tarball including the source code</a>.
 *
 * All operations can be performed on an arbitrary semiring by overloading add() and multiply(), though some optimizations might not work for
 * specialized semirings. While the code is implemented using Intel Cilk Plus (available in the Intel compilers and GCC), it can
 * be ported to any concurrency platform that supports efficient task stealing, such as OpenMP and TBB.
 *
 * The driver accepts matrices in a text-based triples format and in a binary format for faster benchmarking (created using
 * <a href="http://gauss.cs.ucsb.edu/~aydin/csb/dumpbinsparse.m">this MATLAB script</a>). The library also includes functions to convert from the common CSC format,
 * though the conversion is serial and not yet optimized for performance.
 * An example input in (compressed) <a href="http://gauss.cs.ucsb.edu/~aydin/csb/asic_320k.mtx.bz2"> ascii </a> and in (compressed) <a href="http://gauss.cs.ucsb.edu/~aydin/csb/asic_320k.bin.bz2">binary</a>. <br>
 *
 * <b> How to run it? </b>
 *
 * Read the <a href="http://gauss.cs.ucsb.edu/~aydin/csb/Makefile-2013">example makefile</a>. Here is a <a href="http://gauss.cs.ucsb.edu/~aydin/csb/README">README</a> file. <br>
 * Running this code on an 8-core Intel processor works as follows (similarly for the other executables):
 * - make parspmv/parspmv_nobm/parspmvt (the tarball includes sample makefiles as well)
 * - CILK_NWORKERS=8 ./parspmvt ../BinaryMatrices/asic_320k.bin nosym binary <br>
 *
 * If your machine has multiple sockets (NUMA domains), you need to constrain the memory space to a single NUMA node (CSB is not designed for multiple NUMA domains; it will run, but slower):
 *
 * - export CILK_NWORKERS=8 (or 16 if hyperthreading turns out to be beneficial)
 * - numactl --cpunodebind=0 ./parspmvt ../BinaryMatrices/asic_320k.bin nosym binary <br>
 *
 * If you don't set CILK_NWORKERS, the code runs with as many workers as there are hardware threads on your machine (or within the numactl-constrained domain).
 *
 * - ./parspmv ../BinaryMatrices/kkt_power.bin nosym binary (using the binary format for fast I/O)
 * - ./parspmv ../TextMatrices/kkt_power.mtx nosym text (using the matrix market format)
 * - ./spmm_d$$number runs on $$number right-hand-side vectors that are randomly generated using double precision
 * - ./spmm_s$$number uses single precision for the same case
 * - ./both_d runs both parspmv and parspmv_t one after the other (simulating iterative methods such as BiCG and QMR)
 *
 * <b> What do those numbers mean? </b>
 * - BiCSB: the original CSB code with minor performance fixes; nonsymmetric and without register blocking. Quite robust.
 * - BmCSB: bitmasked register blocks in action. Modify RBDIM in utility.h to try different blocking sizes (8x8, 4x4, etc.). May perform better.
 * - CSC: a serial CSC implementation, for reference only.
 *
 * Release notes:
 * - 1.2: Current version. Multiple-vector support.
 *   - A performance bug affecting A'x scaling on certain matrices is fixed.
 * - 1.1: Bitmasked register blocks, a symmetric algorithm using half the bandwidth, and a port to Intel Cilk Plus.
 *   - A performance bug affecting Ax scaling on certain matrices is fixed.
 *   - Minor: a bug in the parspmvt test driver is fixed; a new parspmv_nobm compilation target is added for those who don't have SSE.
 * - 1.0: Initial version. Support for Ax and A'x using Cilk++.
 *
 * <b> Citation: </b>
 *
 * - [1] Aydın Buluç, Jeremy T. Fineman, Matteo Frigo, John R. Gilbert, and Charles E. Leiserson. <i>Parallel sparse matrix-vector and matrix-transpose-vector multiplication using compressed sparse blocks.</i> In SPAA'09: Proceedings of the 21st Annual ACM Symposium on Parallel Algorithms and Architectures, 2009.
 * - [2] Aydın Buluç, Samuel Williams, Leonid Oliker, and James Demmel. <i>Reduced-bandwidth multithreaded algorithms for sparse matrix-vector multiplication.</i> In Proceedings of the IPDPS. IEEE Computer Society, 2011.
 * - [3] H. Metin Aktulga, Aydın Buluç, Samuel Williams, and Chao Yang. <i>Optimizing sparse matrix-multiple vectors multiplication for nuclear configuration interaction calculations.</i> In Proceedings of the IPDPS. IEEE Computer Society, 2014.
 */

Makefile (+12 lines)

GCCOPT = -O2 -fno-rtti -fno-exceptions # -ftree-vectorize
INTELOPT = -O2 -no-ipo -fno-rtti -fno-exceptions -parallel -restrict -std=c++11 -xAVX -no-prec-div #-fno-inline-functions
DEB = -g -DNOBM -O0 -parallel -restrict -std=c++11

seqspmv: csb_spmv_test.cpp bicsb.cpp bicsb.h friends.h utility.h
	g++ $(INCADD) $(GCCOPT) -o seqspmv csb_spmv_test.cpp

clean:
	rm -f seqspmv
	rm -f *.o
README (+61 lines)

========================================================================
    APPLICATION : CSB Overview
========================================================================

Author: Aydin Buluc, LBNL, [email protected]
Date: 2/28/2014

Classes
-------

CSC:
- Class that implements the standard "compressed sparse column" format.
- Used for baseline comparisons.

BiCSB:
- Production (final) class that implements "compressed sparse blocks".
- Nonzeros within a block are stored in "bit-interleaved" order.
- Described in http://dx.doi.org/10.1145/1583991.1584053

BmCSB:
- Class that implements bitmasked register blocks on top of CSB.
- Change the register block dimension inside utility.h (RBDIM); options are 2, 4, 8 (default is 8).
- Described in http://doi.ieeecomputersociety.org/10.1109/IPDPS.2011.73

CSBSYM:
- Class that implements the symmetric algorithm.
- Described in http://doi.ieeecomputersociety.org/10.1109/IPDPS.2011.73

SYM/CSBSYM [do not use]:
- Experimental class that implements a variant of "compressed sparse blocks".
- Nonzeros within a block are stored in row-major order.
- Various optimizations are tried in this class, such as SSE, prefetching, etc.

Files
-----

csb_spmv(t)_test.cpp :
- Driver programs for both sequential and parallel Ax and A'x runs.
- Usage: "./executable matrixfile nosym/sym ascii/binary", or "./executable" alone, in
  which case it reads the ascii file matrix.txt if it exists (only nosym works for now;
  special support for symmetric matrices will be available soon).
- Executables are parspmv, parspmvt, seqspmv, and seqspmvt, where the names are
  self-explanatory.
- For parallel execution, you can specify the number of workers by setting
  the environment variable CILK_NWORKERS.

spmm_test.cpp :
- Driver program for the multiple-vector cases of Ax and A'x (i.e., SpMM for AX and A'X).

bwtest-mimd.cpp :
- Usage: "./bwtest-mimd -n file_1 file_2 ... file_n"
- Bandwidth test program that does SpMV's on n different matrices simultaneously.
- pthreads implementation.

oskispmv(t).cpp :
- Usage: "./oskispmv(t) matrixfile"
- Compares the performance of our serial implementations with plain OSKI to reveal any anomalies.

utility.h :
- Includes constants, preprocessor directives, and utility functions.

/////////////////////////////////////////////////////////////////////////////

Semirings.h (+129 lines)

#ifndef _SEMIRINGS_H_
#define _SEMIRINGS_H_

#include <utility>
#include <climits>
#include <limits>	// std::numeric_limits, used by inf_plus
#include <cmath>
#include <tr1/array>
#include "promote.h"

template <typename T>
struct inf_plus{
	T operator()(const T& a, const T& b) const {
		T inf = std::numeric_limits<T>::max();
		if (a == inf || b == inf){
			return inf;
		}
		return a + b;
	}
};

// (+,*) on scalars
template <class T1, class T2>
struct PTSR
{
	typedef typename promote_trait<T1,T2>::T_promote T_promote;

	static T_promote add(const T1 & arg1, const T2 & arg2)
	{
		return (static_cast<T_promote>(arg1) +
			static_cast<T_promote>(arg2) );
	}
	static T_promote multiply(const T1 & arg1, const T2 & arg2)
	{
		return (static_cast<T_promote>(arg1) *
			static_cast<T_promote>(arg2) );
	}
	// y += a*x overload with a=1
	static void axpy(const T2 & x, T_promote & y)
	{
		y += x;
	}

	static void axpy(T1 a, const T2 & x, T_promote & y)
	{
		y += a*x;
	}
};


template<int Begin, int End, int Step>
struct UnrollerL {
	template<typename Lambda>
	static void step(Lambda& func) {
		func(Begin);
		UnrollerL<Begin+Step, End, Step>::step(func);
	}
};

template<int End, int Step>
struct UnrollerL<End, End, Step> {
	template<typename Lambda>
	static void step(Lambda& func) {
		// base case is when Begin=End; do nothing
	}
};


// (+,*) on std::array's
template<class T1, class T2, unsigned D>
struct PTSRArray
{
	typedef typename promote_trait<T1,T2>::T_promote T_promote;

	// y <- a*x + y overload with a=1
	static void axpy(const array<T2, D> & b, array<T_promote, D> & c)
	{
		const T2 * __restrict barr = b.data();
		T_promote * __restrict carr = c.data();
		__assume_aligned(barr, ALIGN);
		__assume_aligned(carr, ALIGN);

		#pragma simd
		for(unsigned i=0; i<D; ++i)
		{
			carr[i] += barr[i];
		}
		// auto multadd = [&] (int i) { c[i] += b[i]; };
		// UnrollerL<0, D, 1>::step ( multadd );
	}

	// Todo: do partial unrolling; this code will bloat for D > 32
	static void axpy(T1 a, const array<T2,D> & b, array<T_promote,D> & c)
	{
		const T2 * __restrict barr = b.data();
		T_promote * __restrict carr = c.data();
		__assume_aligned(barr, ALIGN);
		__assume_aligned(carr, ALIGN);

		#pragma simd
		for(unsigned i=0; i<D; ++i)
		{
			carr[i] += a * barr[i];
		}
		// auto multadd = [&] (int i) { carr[i] += a * barr[i]; };
		// UnrollerL<0, D, 1>::step ( multadd );
	}
};

// (min,+) on scalars
template <class T1, class T2>
struct MPSR
{
	typedef typename promote_trait<T1,T2>::T_promote T_promote;

	static T_promote add(const T1 & arg1, const T2 & arg2)
	{
		return std::min<T_promote>
			(static_cast<T_promote>(arg1), static_cast<T_promote>(arg2));
	}
	static T_promote multiply(const T1 & arg1, const T2 & arg2)
	{
		// inf_plus is a functor: construct an instance, then call it
		return inf_plus< T_promote >()
			(static_cast<T_promote>(arg1), static_cast<T_promote>(arg2));
	}
};


#endif
