matrixorigin
diff --git a/‎.gitignore
Lines changed: 1 addition & 0 deletions b/‎.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/‎Makefile
Lines changed: 6 additions & 1 deletion b/‎Makefile
Lines changed: 6 additions & 1 deletion
diff --git a/‎cgo/Makefile
Lines changed: 26 additions & 5 deletions b/‎cgo/Makefile
Lines changed: 26 additions & 5 deletions
diff --git a/‎cgo/bitmap.h
Lines changed: 3 additions & 1 deletion b/‎cgo/bitmap.h
Lines changed: 3 additions & 1 deletion
diff --git a/‎cgo/cuda/.gitignore
Lines changed: 2 additions & 0 deletions b/‎cgo/cuda/.gitignore
Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,7 @@
 *.exe
 *.lib
 *.swp
+*.fatbin
 bin/
 y.go
 *.output
 
@@ -110,8 +110,13 @@ THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install
 RACE_OPT :=
 DEBUG_OPT :=
 CGO_DEBUG_OPT :=
+
+ifeq ($(MO_CL_CUDA),1)
+  # Extra linker flags for CUDA builds; GOLDFLAGS passes them through -extldflags.
+  # Indented with spaces on purpose: a TAB here would be read as recipe text
+  # if a rule were ever open above this conditional.
+  CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++
+endif
+
 CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include"
-GOLDFLAGS=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib' $(VERSION_INFO)"
+GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib' $(VERSION_INFO)"
 TAGS :=
 
 ifeq ("$(UNAME_S)","darwin")
 
@@ -1,12 +1,32 @@
 DEBUG_OPT :=
-OPT_LV := -O3
+
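+# Enable fast math: all xcall code should be fast, and strict IEEE
+# compliance should not be an issue there.
+# NOTE(review): OPT_LV feeds CFLAGS, so -ffast-math applies to every object
+# compiled with these flags, not only xcall.o -- confirm that is intended.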
+OPT_LV := -O3 -ffast-math
 CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror
-OBJS=mo.o arith.o compare.o logic.o
+OBJS=mo.o arith.o compare.o logic.o xcall.o
+CUDA_OBJS=
+
+ifeq ($(MO_CL_CUDA),1)
+  # CUDA build: compile with nvcc instead of the default C compiler.
+  # CFLAGS is replaced here, not appended to, so -std=c99, ${OPT_LV}, -Wall and
+  # -Werror from the default CFLAGS do not apply when MO_CL_CUDA=1.
+  CC = /usr/local/cuda/bin/nvcc
+  CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90
+  CFLAGS += -DMO_CL_CUDA
+  # Object archived into libmo.a alongside $(OBJS).
+  CUDA_OBJS += cuda/cuda.o
+  CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++
+endif
 
 all: libmo.a
 
-libmo.a: $(OBJS)
-	ar -rcs libmo.a *.o
+# Archive the C objects into the static library. When MO_CL_CUDA=1 the cuda/
+# sub-make runs first; it is expected to produce the objects in $(CUDA_OBJS).
+# $(MAKE) (not literal make) keeps -j/jobserver and -n working in the sub-make.
+libmo.a: $(OBJS)
+ifeq ($(MO_CL_CUDA),1)
+	$(MAKE) -C cuda
+endif
+	$(AR) -rcs $@ $(OBJS) $(CUDA_OBJS)
+
+# Disabled alternative: link with $(CC) instead of archiving with ar.
+#	$(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS)
+
+
 
 .PHONY: debug
 debug: override OPT_LV := -O0
@@ -15,4 +35,5 @@ debug: all
 
 .PHONY: clean
 clean:
-	rm -f *.o *.a
+	rm -f *.o *.a *.so
+	$(MAKE) -C cuda clean
@@ -17,7 +17,6 @@
 #ifndef _BITMAP_H_
 #define _BITMAP_H_
 
-#include "mo.h"
 #include <string.h>
 
 /*
@@ -46,6 +45,9 @@ static inline uint64_t bitmap_pos2mask(uint64_t pos) {
 }
 
 static inline bool bitmap_test(uint64_t *p, uint64_t pos) {
+    if (p == NULL) { /* NULL bitmap: report the bit as not set */
+        return false;
+    }
     return (p[bitmap_pos2idx(pos)] & bitmap_pos2mask(pos)) != 0;
 }
 
 
@@ -0,0 +1,2 @@
+mytest
+*.fatbin
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,6 @@`
`17`	`17`	`#ifndef _BITMAP_H_`
`18`	`18`	`#define _BITMAP_H_`
`19`	`19`
`20`		`-#include "mo.h"`
`21`	`20`	`#include <string.h>`
`22`	`21`
`23`	`22`	`/*`
`@@ -46,6 +45,9 @@ static inline uint64_t bitmap_pos2mask(uint64_t pos) {`
`46`	`45`	`}`
`47`	`46`
`48`	`47`	`static inline bool bitmap_test(uint64_t *p, uint64_t pos) {`
	`48`	`+ if (p == NULL) {`
	`49`	`+ return false;`
	`50`	`+ }`
`49`	`51`	`return (p[bitmap_pos2idx(pos)] & bitmap_pos2mask(pos)) != 0;`
`50`	`52`	`}`
`51`	`53`