Skip to content

Commit 363bbd8

Browse files
authored
Fengttt cl (#18212)
## What type of PR is this? - [ ] API-change - [ ] BUG - [ ] Improvement - [ ] Documentation - [x] Feature - [ ] Test and CI - [ ] Code Refactoring ## Which issue(s) this PR fixes: issue #18213 ## What this PR does / why we need it: Adds a generic call method that lets the function framework issue generic RPC calls to a cl_host. Only one special cl_host, "CGO", is implemented at this time. CGO is treated as a special cl_host, so there is no need for memcpy or network RPC. Function calls can have a cl_runtime; this PR supports "C" and "CUDA". The f32/f64 versions of l2distance are used as examples to show how to offload the computation to "C" or "CUDA". For "C", the performance is roughly the same as pure Go (so you may ask why we should do this ...): the same for vecf32(128) and 10%-15% faster for vecf32(960). "CUDA" works as a demo; it was built and tested on a 4090 under WSL. The PR has all the infrastructure to send data from the database to the GPU, run a GPU kernel, and return the result. CUDA shines on truly compute-intensive workloads: deep networks, bitcoin, image processing, ... All of these are ready to plug in. To build: A simple `make clean; make` will build the "CGO" cl_host and the "C" cl_runtime. `MO_CL_CUDA=1 make` will build the "CUDA" cl_runtime. You must have CUDA installed already.
1 parent 8c6233b commit 363bbd8

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

89 files changed

+45548
-10
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*.exe
55
*.lib
66
*.swp
7+
*.fatbin
78
bin/
89
y.go
910
*.output

Makefile

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,13 @@ THIRDPARTIES_INSTALL_DIR=$(ROOT_DIR)/thirdparties/install
110110
RACE_OPT :=
111111
DEBUG_OPT :=
112112
CGO_DEBUG_OPT :=
113+
114+
ifeq ($(MO_CL_CUDA),1)
115+
CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++
116+
endif
117+
113118
CGO_OPTS :=CGO_CFLAGS="-I$(THIRDPARTIES_INSTALL_DIR)/include"
114-
GOLDFLAGS=-ldflags="-extldflags '-L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib' $(VERSION_INFO)"
119+
GOLDFLAGS=-ldflags="-extldflags '$(CUDA_LDFLAGS) -L$(THIRDPARTIES_INSTALL_DIR)/lib -Wl,-rpath,\$${ORIGIN}/lib' $(VERSION_INFO)"
115120
TAGS :=
116121

117122
ifeq ("$(UNAME_S)","darwin")

cgo/Makefile

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,32 @@
11
DEBUG_OPT :=
2-
OPT_LV := -O3
2+
3+
# Yeah, fast math. We want it to be fast, for all xcall,
4+
# IEEE compliance should not be an issue.
5+
OPT_LV := -O3 -ffast-math
36
CFLAGS=-std=c99 -g ${OPT_LV} -Wall -Werror
4-
OBJS=mo.o arith.o compare.o logic.o
7+
OBJS=mo.o arith.o compare.o logic.o xcall.o
8+
CUDA_OBJS=
9+
10+
ifeq ($(MO_CL_CUDA),1)
11+
CC = /usr/local/cuda/bin/nvcc
12+
CFLAGS = -ccbin g++ -m64 --shared -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_89,code=sm_89 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_90,code=compute_90
13+
CFLAGS += -DMO_CL_CUDA
14+
CUDA_OBJS += cuda/cuda.o
15+
CUDA_LDFLAGS := -L/usr/local/cuda/lib64/stubs -lcuda -L/usr/local/cuda/lib64 -lcudart -lstdc++
16+
endif
517

618
all: libmo.a
719

8-
libmo.a: $(OBJS)
9-
ar -rcs libmo.a *.o
20+
libmo.a: $(OBJS)
21+
ifeq ($(MO_CL_CUDA),1)
22+
make -C cuda
23+
endif
24+
ar -rcs libmo.a $(OBJS) $(CUDA_OBJS)
25+
26+
#
27+
# $(CC) -o libmo.a $(OBJS) $(CUDA_OBJS) $(CUDA_LDFLAGS)
28+
29+
1030

1131
.PHONY: debug
1232
debug: override OPT_LV := -O0
@@ -15,4 +35,5 @@ debug: all
1535

1636
.PHONY: clean
1737
clean:
18-
rm -f *.o *.a
38+
rm -f *.o *.a *.so
39+
make -C cuda clean

cgo/bitmap.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
#ifndef _BITMAP_H_
1818
#define _BITMAP_H_
1919

20-
#include "mo.h"
2120
#include <string.h>
2221

2322
/*
@@ -46,6 +45,9 @@ static inline uint64_t bitmap_pos2mask(uint64_t pos) {
4645
}
4746

4847
static inline bool bitmap_test(uint64_t *p, uint64_t pos) {
48+
if (p == NULL) {
49+
return false;
50+
}
4951
return (p[bitmap_pos2idx(pos)] & bitmap_pos2mask(pos)) != 0;
5052
}
5153

cgo/cuda/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
mytest
2+
*.fatbin

0 commit comments

Comments
 (0)