
Commit e9989b5

Dynamic memory allocation. Drop Baichuan/InternLM support in favor of llama.cpp. (#305)
1 parent a0f2d4a commit e9989b5

25 files changed: 1256 additions, 2485 deletions

Diff for: .github/workflows/cmake.yml

1 addition, 1 deletion

```diff
@@ -18,7 +18,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-13]
+        os: [ubuntu-latest, windows-latest, macos-13, macos-14]
 
     steps:
       - uses: actions/checkout@v3
```

Diff for: CMakeLists.txt

15 additions, 17 deletions

```diff
@@ -8,6 +8,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE STRING "")
 set(CMAKE_CXX_STANDARD 17)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-expansion-to-defined") # suppress ggml warnings
 
 if (NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
@@ -24,17 +25,8 @@ if (CHATGLM_ENABLE_PYBIND)
 endif ()
 
 # third-party libraries
-add_compile_definitions(GGML_CUDA_MMV_Y=4) # for large vocab
-include_directories(third_party/ggml/include/ggml third_party/ggml/src)
-add_subdirectory(third_party/ggml)
-
-set(SPM_ENABLE_SHARED OFF CACHE BOOL "chatglm: disable sentencepiece shared libraries by default")
-set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "chatglm: disable tcmalloc by default")
-include_directories(third_party/sentencepiece/src)
-add_subdirectory(third_party/sentencepiece)
-
-if (GGML_CUBLAS)
-    add_compile_definitions(GGML_USE_CUBLAS)
+if (GGML_CUDA)
+    add_compile_definitions(GGML_USE_CUDA)
     enable_language(CUDA)
     # ref: https://stackoverflow.com/questions/28932864/which-compute-capability-is-supported-by-which-cuda-versions
     set(CUDA_ARCH_LIST "52;61;70;75")
@@ -47,10 +39,17 @@ if (GGML_CUBLAS)
     if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
         set(CUDA_ARCH_LIST "${CUDA_ARCH_LIST};89;90")
     endif ()
-    set(CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "chatglm: cuda architectures to compile")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES ${CUDA_ARCHITECTURES})
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST} CACHE STRING "")
 endif ()
 
+include_directories(third_party/ggml/include/ggml third_party/ggml/src)
+add_subdirectory(third_party/ggml)
+
+set(SPM_ENABLE_SHARED OFF CACHE BOOL "chatglm: disable sentencepiece shared libraries by default")
+set(SPM_ENABLE_TCMALLOC OFF CACHE BOOL "chatglm: disable tcmalloc by default")
+include_directories(third_party/sentencepiece/src)
+add_subdirectory(third_party/sentencepiece)
+
 include_directories(third_party/sentencepiece/third_party/protobuf-lite)
 
 set(ABSL_ENABLE_INSTALL ON CACHE BOOL "" FORCE)
@@ -72,9 +71,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
 file(GLOB CPP_SOURCES
     ${PROJECT_SOURCE_DIR}/*.h
-    ${PROJECT_SOURCE_DIR}/*.cpp)
-
-set_source_files_properties(${CPP_SOURCES} PROPERTIES COMPILE_FLAGS "-pedantic-errors")
+    ${PROJECT_SOURCE_DIR}/*.cpp
+    ${PROJECT_SOURCE_DIR}/tests/*.cpp)
 
 add_library(chatglm STATIC chatglm.cpp)
 target_link_libraries(chatglm PUBLIC ggml sentencepiece-static re2)
@@ -137,7 +135,7 @@ add_custom_target(check-all
     COMMAND cmake --build build -j
     COMMAND ./build/bin/chatglm_test
     COMMAND python3 setup.py develop
-    COMMAND python3 -m pytest tests/test_chatglm_cpp.py -v
+    COMMAND python3 -m pytest tests/test_chatglm_cpp.py
     WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
 )
 
```
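In practice, these CMake changes mean the CUDA backend is now toggled with ggml's newer option names. A minimal configure sketch restating the flags used in this diff and in the README update below (the compute capability value "80" is only an example):

```sh
# Old naming (before this commit): cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="80"
# New naming, using the standard CMake architectures variable:
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="80"
cmake --build build -j
```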

Diff for: Dockerfile

1 addition, 1 deletion

```diff
@@ -2,7 +2,7 @@ ARG BASE_IMAGE=ubuntu:20.04
 
 FROM ${BASE_IMAGE} AS build
 
-ARG CMAKE_ARGS="-DGGML_CUBLAS=OFF"
+ARG CMAKE_ARGS="-DGGML_CUDA=OFF"
 
 WORKDIR /chatglm.cpp
 
```
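The renamed flags reach the container build through this same `CMAKE_ARGS` argument; the sketch below simply mirrors the CUDA image invocation from the README section updated later in this commit:

```sh
# Build a CUDA-enabled image by overriding the default CMAKE_ARGS shown above.
docker build . --network=host -t chatglm.cpp-cuda \
    --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \
    --build-arg CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80"
```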

Diff for: README.md

17 additions, 148 deletions

````diff
@@ -6,7 +6,7 @@
 ![Python](https://img.shields.io/pypi/pyversions/chatglm-cpp)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue)](LICENSE)
 
-C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4) and more LLMs for real-time chatting on your MacBook.
+C++ implementation of [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3) and [GLM-4](https://github.com/THUDM/GLM-4) for real-time chatting on your MacBook.
 
 ![demo](docs/demo.gif)
 
@@ -22,9 +22,7 @@ Highlights:
 Support Matrix:
 * Hardwares: x86/arm CPU, NVIDIA GPU, Apple Silicon GPU
 * Platforms: Linux, MacOS, Windows
-* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan-7B](https://github.com/baichuan-inc/Baichuan-7B), [Baichuan-13B](https://github.com/baichuan-inc/Baichuan-13B), [Baichuan2](https://github.com/baichuan-inc/Baichuan2), [InternLM](https://github.com/InternLM/InternLM)
-
-**NOTE**: Baichuan & InternLM model series are deprecated in favor of [llama.cpp](https://github.com/ggerganov/llama.cpp).
+* Models: [ChatGLM-6B](https://github.com/THUDM/ChatGLM-6B), [ChatGLM2-6B](https://github.com/THUDM/ChatGLM2-6B), [ChatGLM3](https://github.com/THUDM/ChatGLM3), [GLM-4](https://github.com/THUDM/GLM-4), [CodeGeeX2](https://github.com/THUDM/CodeGeeX2)
 
 ## Getting Started
 
@@ -59,7 +57,6 @@ The original model (`-i <model_name_or_path>`) can be a Hugging Face model name
 * ChatGLM3-6B: `THUDM/chatglm3-6b`
 * ChatGLM4-9B: `THUDM/glm-4-9b-chat`
 * CodeGeeX2: `THUDM/codegeex2-6b`, `THUDM/codegeex2-6b-int4`
-* Baichuan & Baichuan2: `baichuan-inc/Baichuan-13B-Chat`, `baichuan-inc/Baichuan2-7B-Chat`, `baichuan-inc/Baichuan2-13B-Chat`
 
 You are free to try any of the below quantization types by specifying `-t <type>`:
 * `q4_0`: 4-bit integer quantization with fp16 scales.
@@ -212,56 +209,6 @@ print(bubble_sort([5, 4, 3, 2, 1]))
 ```
 </details>
 
-<details>
-<summary>Baichuan-13B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan-13B-Chat -t q4_0 -o models/baichuan-13b-chat-ggml.bin
-./build/bin/main -m models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1
-# 你好!有什么我可以帮助你的吗?
-```
-</details>
-
-<details>
-<summary>Baichuan2-7B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-7B-Chat -t q4_0 -o models/baichuan2-7b-chat-ggml.bin
-./build/bin/main -m models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05
-# 你好!很高兴为您提供帮助。请问有什么问题我可以帮您解答?
-```
-</details>
-
-<details>
-<summary>Baichuan2-13B-Chat</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i baichuan-inc/Baichuan2-13B-Chat -t q4_0 -o models/baichuan2-13b-chat-ggml.bin
-./build/bin/main -m models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05
-# 你好!今天我能为您提供什么帮助?
-```
-</details>
-
-<details>
-<summary>InternLM-Chat-7B</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i internlm/internlm-chat-7b -t q4_0 -o models/internlm-chat-7b-ggml.bin
-./build/bin/main -m models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
-# 你好,我是书生·浦语,有什么可以帮助你的吗?
-```
-</details>
-
-<details>
-<summary>InternLM-Chat-20B</summary>
-
-```sh
-python3 chatglm_cpp/convert.py -i internlm/internlm-chat-20b -t q4_0 -o models/internlm-chat-20b-ggml.bin
-./build/bin/main -m models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8
-# 你好!有什么我可以帮到你的吗?
-```
-</details>
-
 ## Using BLAS
 
 BLAS library can be integrated to further accelerate matrix multiplication. However, in some cases, using BLAS may cause performance degradation. Whether to turn on BLAS should depend on the benchmarking result.
@@ -277,17 +224,17 @@ OpenBLAS provides acceleration on CPU. Add the CMake flag `-DGGML_OPENBLAS=ON` t
 cmake -B build -DGGML_OPENBLAS=ON && cmake --build build -j
 ```
 
-**cuBLAS**
+**CUDA**
 
-cuBLAS uses NVIDIA GPU to accelerate BLAS. Add the CMake flag `-DGGML_CUBLAS=ON` to enable it.
+CUDA accelerates model inference on NVIDIA GPU. Add the CMake flag `-DGGML_CUDA=ON` to enable it.
 ```sh
-cmake -B build -DGGML_CUBLAS=ON && cmake --build build -j
+cmake -B build -DGGML_CUDA=ON && cmake --build build -j
 ```
 
-By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example:
+By default, all kernels will be compiled for all possible CUDA architectures and it takes some time. To run on a specific type of device, you may specify `CMAKE_CUDA_ARCHITECTURES` to speed up the nvcc compilation. For example:
 ```sh
-cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="80"       # for A100
-cmake -B build -DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES="70;75"    # compatible with both V100 and T4
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="80"       # for A100
+cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="70;75"    # compatible with both V100 and T4
 ```
 
 To find out the CUDA architecture of your GPU device, see [Your GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
@@ -310,9 +257,9 @@ Install from PyPI (recommended): will trigger compilation on your platform.
 pip install -U chatglm-cpp
 ```
 
-To enable cuBLAS acceleration on NVIDIA GPU:
+To enable CUDA on NVIDIA GPU:
 ```sh
-CMAKE_ARGS="-DGGML_CUBLAS=ON" pip install -U chatglm-cpp
+CMAKE_ARGS="-DGGML_CUDA=ON" pip install -U chatglm-cpp
 ```
 
 To enable Metal on Apple silicon devices:
@@ -426,51 +373,6 @@ python3 web_demo.py -m ../models/codegeex2-ggml.bin --temp 0 --max_length 512 --
 ```
 </details>
 
-<details>
-<summary>Baichuan-13B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # CLI demo
-python3 web_demo.py -m ../models/baichuan-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.1 # web demo
-```
-</details>
-
-<details>
-<summary>Baichuan2-7B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan2-7b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo
-python3 web_demo.py -m ../models/baichuan2-7b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo
-```
-</details>
-
-<details>
-<summary>Baichuan2-13B-Chat</summary>
-
-```sh
-python3 cli_demo.py -m ../models/baichuan2-13b-chat-ggml.bin -p 你好 --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # CLI demo
-python3 web_demo.py -m ../models/baichuan2-13b-chat-ggml.bin --top_k 5 --top_p 0.85 --temp 0.3 --repeat_penalty 1.05 # web demo
-```
-</details>
-
-<details>
-<summary>InternLM-Chat-7B</summary>
-
-```sh
-python3 cli_demo.py -m ../models/internlm-chat-7b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo
-python3 web_demo.py -m ../models/internlm-chat-7b-ggml.bin --top_p 0.8 --temp 0.8 # web demo
-```
-</details>
-
-<details>
-<summary>InternLM-Chat-20B</summary>
-
-```sh
-python3 cli_demo.py -m ../models/internlm-chat-20b-ggml.bin -p 你好 --top_p 0.8 --temp 0.8 # CLI demo
-python3 web_demo.py -m ../models/internlm-chat-20b-ggml.bin --top_p 0.8 --temp 0.8 # web demo
-```
-</details>
-
 **Converting Hugging Face LLMs at Runtime**
 
 Sometimes it might be inconvenient to convert and save the intermediate GGML models beforehand. Here is an option to directly load from the original Hugging Face model, quantize it into GGML models in a minute, and start serving. All you need is to replace the GGML model path with the Hugging Face model name or path.
@@ -579,7 +481,7 @@ For CUDA support, make sure [nvidia-docker](https://github.com/NVIDIA/nvidia-doc
 ```sh
 docker build . --network=host -t chatglm.cpp-cuda \
     --build-arg BASE_IMAGE=nvidia/cuda:12.2.0-devel-ubuntu20.04 \
-    --build-arg CMAKE_ARGS="-DGGML_CUBLAS=ON -DCUDA_ARCHITECTURES=80"
+    --build-arg CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80"
 docker run -it --rm --gpus all -v $PWD/models:/chatglm.cpp/models chatglm.cpp-cuda \
     ./build/bin/main -m models/chatglm-ggml.bin -p "你好"
 ```
@@ -631,45 +533,12 @@ ChatGLM2-6B / ChatGLM3-6B / CodeGeeX2:
 
 ChatGLM4-9B:
 
-|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16   |
-|--------------------------------|-------|-------|-------|-------|-------|-------|
-| ms/token (CPU @ Platinum 8260) | 105   | 105   | 122   | 134   | 158   | 279   |
-| ms/token (CUDA @ V100 SXM2)    | 12.1  | 12.5  | 13.8  | 13.9  | 17.7  | 27.7  |
-| file size                      | 5.0G  | 5.5G  | 6.1G  | 6.6G  | 9.4G  | 18G   |
-
-Baichuan-7B / Baichuan2-7B:
-
-|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16   |
-|--------------------------------|-------|-------|-------|-------|-------|-------|
-| ms/token (CPU @ Platinum 8260) | 85.3  | 94.8  | 103.4 | 109.6 | 136.8 | 248.5 |
-| ms/token (CUDA @ V100 SXM2)    | 8.7   | 9.2   | 10.2  | 10.3  | 13.2  | 21.0  |
-| ms/token (MPS @ M2 Ultra)      | 11.3  | 12.0  | N/A   | N/A   | 16.4  | 25.6  |
-| file size                      | 4.0G  | 4.4G  | 4.9G  | 5.3G  | 7.5G  | 14G   |
-| mem usage                      | 4.5G  | 4.9G  | 5.3G  | 5.7G  | 7.8G  | 14G   |
-
-Baichuan-13B / Baichuan2-13B:
-
-|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16   |
-|--------------------------------|-------|-------|-------|-------|-------|-------|
-| ms/token (CPU @ Platinum 8260) | 161.7 | 175.8 | 189.9 | 192.3 | 255.6 | 459.6 |
-| ms/token (CUDA @ V100 SXM2)    | 13.7  | 15.1  | 16.3  | 16.9  | 21.9  | 36.8  |
-| ms/token (MPS @ M2 Ultra)      | 18.2  | 18.8  | N/A   | N/A   | 27.2  | 44.4  |
-| file size                      | 7.0G  | 7.8G  | 8.5G  | 9.3G  | 14G   | 25G   |
-| mem usage                      | 7.8G  | 8.8G  | 9.5G  | 10G   | 14G   | 25G   |
-
-InternLM-7B:
-
-|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16   |
-|--------------------------------|-------|-------|-------|-------|-------|-------|
-| ms/token (CPU @ Platinum 8260) | 85.3  | 90.1  | 103.5 | 112.5 | 137.3 | 232.2 |
-| ms/token (CUDA @ V100 SXM2)    | 9.1   | 9.4   | 10.5  | 10.5  | 13.3  | 21.1  |
-
-InternLM-20B:
-
-|                                | Q4_0  | Q4_1  | Q5_0  | Q5_1  | Q8_0  | F16   |
-|--------------------------------|-------|-------|-------|-------|-------|-------|
-| ms/token (CPU @ Platinum 8260) | 230.0 | 236.7 | 276.6 | 290.6 | 357.1 | N/A   |
-| ms/token (CUDA @ V100 SXM2)    | 21.6  | 23.2  | 25.0  | 25.9  | 33.4  | N/A   |
+|                                | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | F16  |
+|--------------------------------|------|------|------|------|------|------|
+| ms/token (CPU @ Platinum 8260) | 105  | 105  | 122  | 134  | 158  | 279  |
+| ms/token (CUDA @ V100 SXM2)    | 12.1 | 12.5 | 13.8 | 13.9 | 17.7 | 27.7 |
+| ms/token (MPS @ M2 Ultra)      | 14.4 | 15.3 | 19.6 | 20.1 | 20.7 | 32.4 |
+| file size                      | 5.0G | 5.5G | 6.1G | 6.6G | 9.4G | 18G  |
 
 ## Model Quality
 
````
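For the models that stay in the support matrix, the convert-then-run flow is unchanged; here is a minimal sketch using the `convert.py` flags shown in the removed examples above (the output filename is only illustrative). Baichuan and InternLM GGML conversions are no longer produced here; llama.cpp's own tooling covers those models.

```sh
# Convert a still-supported model and chat with it (output path is an arbitrary example).
python3 chatglm_cpp/convert.py -i THUDM/glm-4-9b-chat -t q4_0 -o models/chatglm4-ggml.bin
./build/bin/main -m models/chatglm4-ggml.bin -p 你好
```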