runq - remove blas & optimize

trholding · trholding · commit 036d7cb9f254 · 2024-07-20T17:44:29.000+05:30
runq - optimize matmul and quantization functions with OpenMP
diff --git a/Makefile b/Makefile
@@ -90,55 +90,55 @@ run_cc_openmp: ##		- OpenMP accelerated build
 
 .PHONY: runq_cc_openmp
 runq_cc_openmp: ##		- Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c  $(BOLT) -lm  -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c  $(BOLT) -lm  -o run
 
 .PHONY: run_cc_openacc
 run_cc_openacc: ##		- OpenACC accelerated build
 	$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native run.c  $(BOLT) -lm  -o run	
 
 .PHONY: runq_cc_openacc
 runq_cc_openacc: ##		- Same for quantized build
-	$(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c  $(BOLT) -lm  -o run	
+	$(CC) -D OPENACC -D CAT -Ofast -fopenacc -march=native -mtune=native runq.c  $(BOLT) -lm  -o run	
 
 .PHONY: run_cc_omp_gnu
 run_cc_omp_gnu: ##		- Generic linux distro + OpenMP build
 	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 run.c  $(BOLT) -lm  -o run
 
 .PHONY: runq_cc_omp_gnu
 runq_cc_omp_gnu: ##		- Same for quantized build
-	$(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c  $(BOLT) -lm  -o run
+	$(CC) -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c  $(BOLT) -lm  -o run
 
 .PHONY: run_cc_clblast
 run_cc_clblast: ##		- CLBlast OpenCL CBLAS GPU accelerated build
 	$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lclblast -o run
 
 .PHONY: runq_cc_clblast
 runq_cc_clblast: ##		- Same for quantized build
-	$(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
+	$(CC) -D OPENMP -D CAT -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lclblast -o run
 
 .PHONY: run_cc_openblas
 run_cc_openblas: ##		- Openblas CBLAS accelerated build
 	$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) run.c $(BOLT) -lm -lopenblas -o run
 
 .PHONY: runq_cc_openblas
 runq_cc_openblas: ##		- Same for quantized build
-	$(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
+	$(CC) -D OPENMP -D CAT -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c $(BOLT) -lm -lopenblas -o run
 
 .PHONY: run_cc_cblas
 run_cc_cblas: ##		- Generic CBLAS accelerated build
 	$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native run.c $(BOLT) -lm -lcblas -o run
 
 .PHONY: runq_cc_cblas
 runq_cc_cblas: ##		- Same for quantized build
-	$(CC) -D OPENMP -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
+	$(CC) -D OPENMP -D CAT -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -lcblas -o run
 
 .PHONY: run_cc_blis
 run_cc_blis: ##		- BLIS accelerated build
 	$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) run.c $(BOLT) -lm -lblis -o run
 	
 .PHONY: runq_cc_blis
 runq_cc_blis: ##		- Same for quantized build
-	$(CC) -D OPENMP -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
+	$(CC) -D OPENMP -D CAT -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c $(BOLT) -lm -lblis -o run
 
 ##@ Special Builds 
 ##@ ---> x86_64
@@ -149,7 +149,7 @@ run_cc_mkl: ##		- ***NEW*** OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac)
 
 .PHONY: runq_cc_mkl 
 runq_cc_mkl: ##		- Same for quantized build
-	$(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run	
+	$(CC) -D MKL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native -I$(MKL_INC) -L$(MKL_LIB) runq.c -lmkl_rt -lpthread $(BOLT) -lm -o run	
 
 ##@ ---> ARM64 / aarch64
 .PHONY: run_cc_armpl
@@ -158,7 +158,7 @@ run_cc_armpl: ##		- ARM PL BLAS accelerated build (aarch64)
 
 .PHONY: runq_cc_armpl
 runq_cc_armpl: ##		- Same for quantized build
-	$(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
+	$(CC) -D ARMPL -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -larmpl_lp64_mp -o run
 
 ##@ ---> Macintosh
 .PHONY: run_cc_mac_accel
@@ -167,7 +167,7 @@ run_cc_mac_accel: ##		- Mac OS OPENMP + CBLAS via Accelerate Framework build (WI
 
 .PHONY: runq_cc_mac_accel
 runq_cc_mac_accel: ##		- Same for quantized build
-	$(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
+	$(CC) -D AAF -D OPENMP -D CAT -Ofast -fopenmp -march=native -mtune=native runq.c $(BOLT) -lm -framework Accelerate -o run
 
 ##@ ---> Windows
 .PHONY: run_win64
diff --git a/runq.c b/runq.c
@@ -129,8 +129,10 @@ __static_yoink("zipos");
 
 // Portable OpenMP and OpenACC pragma macros
 #ifdef OPENMP
+#define ACCELS() MK_PRAGMA(omp parallel for)
 #define ACCEL(...) MK_PRAGMA(omp parallel for private(__VA_ARGS__))
 #elif defined(OPENACC)
+#define ACCELS() MK_PRAGMA(acc parallel loop)
 #define ACCEL(...) MK_PRAGMA(acc parallel loop private(__VA_ARGS__))
 #endif
 
@@ -154,7 +156,13 @@ __static_yoink("zipos");
 #endif
 // ----------------------------------------------------------------------------
 // Globals
+// L2E Addition
+#if defined CAT
+const int GS = 64; // CAT (Cheap Acceleration Tech): group size is a compile-time constant (64); the checkpoint's group_size is not assigned to it
+#else
 int GS = 0; // group size global for quantization of the weights
+#endif
+// END L2E Addition
 
 // ----------------------------------------------------------------------------
 // Transformer model
@@ -275,6 +283,11 @@ void free_run_state(RunState* s) {
 // Quantization functions
 
 void dequantize(QuantizedTensor *qx, float* x, int n) {
+// L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+// END L2E Addition
     for (int i = 0; i < n; i++) {
         x[i] = qx->q[i] * qx->s[i / GS];
     }
@@ -284,6 +297,11 @@ void quantize(QuantizedTensor *qx, float* x, int n) {
     int num_groups = n / GS;
     float Q_MAX = 127.0f;
 
+// L2E Addition
+    #ifdef ACCEL
+    ACCELS() // OMP/OACC Macro
+    #endif
+// END L2E Addition
     for (int group = 0; group < num_groups; group++) {
 
         // find the max absolute value in the current group
@@ -391,7 +409,18 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     int group_size = *(int*) ptr;
     ptr += sizeof(int);
 
+// L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #else
+    // CAT builds compile GS in as a constant, so a checkpoint quantized with a
+    // different group size would be indexed with the wrong scales: fail loudly.
+    if (group_size != GS) {
+        fprintf(stderr, "checkpoint group size %d does not match CAT build group size %d\n", group_size, GS);
+        exit(EXIT_FAILURE);
+    }
+    #endif
+// END L2E Addition
 
     void* weights_ptr = ((char*)*data) + header_size; // skip header bytes
     memory_map_weights(weights, config, weights_ptr, shared_classifier);
@@ -419,7 +441,20 @@ void read_checkpoint(char* checkpoint, Config* config, TransformerWeights* weigh
     if (fread(&shared_classifier, sizeof(uint8_t), 1, file) != 1) { exit(EXIT_FAILURE); }
     int group_size; // the group size used in quantization
     if (fread(&group_size, sizeof(int), 1, file) != 1) { exit(EXIT_FAILURE); }
+
+// L2E Addition
+    #ifndef CAT
     GS = group_size; // set as global, as it will be used in many places
+    #else
+    // CAT builds compile GS in as a constant, so a checkpoint quantized with a
+    // different group size would be indexed with the wrong scales: fail loudly.
+    if (group_size != GS) {
+        fprintf(stderr, "checkpoint group size %d does not match CAT build group size %d\n", group_size, GS);
+        exit(EXIT_FAILURE);
+    }
+    #endif
+// END L2E Addition
+
     // figure out the file size
     fseek(file, 0, SEEK_END); // move file pointer to end of file
     *file_size = ftell(file); // get the file size, in bytes
@@ -508,64 +536,77 @@ void softmax(float* x, int size) {
     }
 }
 
+// L2E Addition
+#ifdef CAT
+
 void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
     // W (d,n) @ x (n,) -> xout (d,)
     // by far the most amount of time is spent inside this little function
     // inputs to this function are both quantized
 
-// L2E Addition
-
-    #ifdef BLAS
     int i;
-    int j;
-    
-    // Convert quantized tensors to floating point
-    float* w_fp = malloc(d * n * sizeof(float));
-    float* x_fp = malloc(n * sizeof(float));
-
     #ifdef ACCEL
-    ACCEL(i, j) // OMP/OACC Macro
-    #endif     
+    ACCEL(i) // OMP/OACC Macro
+    #endif
     for (i = 0; i < d; i++) {
-        for (j = 0; j < n; j++) {
-            w_fp[i * n + j] = ((float) w->q[i * n + j]) * w->s[i / GS];
+
+        float val = 0.0f;
+        int32_t ival = 0;
+        int in = i * n;
+
+        // do the matmul in groups of GS
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
+            // unroll the inner loop by a factor of 4 (relies on GS being a multiple of 4; CAT fixes GS at 64)
+            for (int k = 0; k < GS; k += 4) {
+                ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
+                ival += ((int32_t) x->q[j + k + 1]) * ((int32_t) w->q[in + j + k + 1]);
+                ival += ((int32_t) x->q[j + k + 2]) * ((int32_t) w->q[in + j + k + 2]);
+                ival += ((int32_t) x->q[j + k + 3]) * ((int32_t) w->q[in + j + k + 3]);
+            }
+            val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
+            ival = 0;
         }
-    }
 
-    #ifdef ACCEL
-    ACCEL(j) // OMP/OACC Macro
-    #endif    
-    for (j = 0; j < n; j++) {
-        x_fp[j] = ((float) x->q[j]) * x->s[j / GS];
+        xout[i] = val;
     }
+}
 
-    cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w_fp, n, x_fp, 1, 0.0f, xout, 1);
-
-    // Free memory
-    free(w_fp);
-    free(x_fp);
+#else
+// END L2E Addition
+void matmul(float* xout, QuantizedTensor *x, QuantizedTensor *w, int n, int d) {
+    // W (d,n) @ x (n,) -> xout (d,)
+    // by far the most amount of time is spent inside this little function
+    // inputs to this function are both quantized
 
-    #else
+    int i;
+// L2E Addition
+    #ifdef ACCEL
+    ACCEL(i) // OMP/OACC Macro
+    #endif
 // END L2E Addition
-    for (int i = 0; i < d; i++) {
+    for (i = 0; i < d; i++) {
+
         float val = 0.0f;
         int32_t ival = 0;
         int in = i * n;
 
         // do the matmul in groups of GS
-        for (int j = 0; j <= n - GS; j += GS) {
+        int j;
+        for (j = 0; j <= n - GS; j += GS) {
             for (int k = 0; k < GS; k++) {
                 ival += ((int32_t) x->q[j + k]) * ((int32_t) w->q[in + j + k]);
             }
             val += ((float) ival) * w->s[(in + j) / GS] * x->s[j / GS];
             ival = 0;
         }
+
         xout[i] = val;
     }
+}
 // L2E Addition
-    #endif 
+#endif 
 // END L2E Addition 
-}
 
 float* forward(Transformer* transformer, int token, int pos) {