Skip to content

Commit e144b45

Browse files
author
lexasub
committed
experiments with websocket-stream
1 parent f0d2bfb commit e144b45

12 files changed

+559
-2
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ option(WHISPER_BUILD_SERVER "whisper: build server example" ${WHISPER_STANDALO
8383
# 3rd party libs
8484
option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
8585
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
86+
option(WEBSOCKET "whisper: support for websocket" OFF)
8687

8788
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
8889
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)

examples/CMakeLists.txt

+4-2
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,10 @@ if (EMSCRIPTEN)
100100
add_subdirectory(bench.wasm)
101101
elseif(CMAKE_JS_VERSION)
102102
add_subdirectory(addon.node)
103-
else()
104-
add_subdirectory(cli)
103+
else()
104+
# The CLI example is always built, as before this change.
add_subdirectory(cli)
# The WebSocket streaming example is opt-in (-DWEBSOCKET=ON) because it
# needs ixwebsocket and libcurl to be available.
if (WEBSOCKET)
    add_subdirectory(websocket-stream)
endif()
105107
add_subdirectory(bench)
106108
add_subdirectory(server)
107109
add_subdirectory(quantize)
+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Builds the whisper-websocket-stream example executable.
set(TARGET whisper-websocket-stream)
add_executable(${TARGET}
    main.cpp
    whisper-server.cpp
    message-buffer.cpp
)

find_package(ixwebsocket)
find_package(CURL REQUIRED)
# zlib and the thread library were previously linked by raw name; resolve them
# through their find modules so missing packages fail at configure time.
find_package(ZLIB REQUIRED)
find_package(Threads REQUIRED)

include(DefaultTargetOptions)

# Prefer the namespaced imported target when the installed package provides
# one; otherwise fall back to the bare library name that was linked before.
if (TARGET ixwebsocket::ixwebsocket)
    set(IXWEBSOCKET_LIB ixwebsocket::ixwebsocket)
else()
    set(IXWEBSOCKET_LIB ixwebsocket)
endif()

target_link_libraries(${TARGET} PRIVATE
    common
    whisper
    ${IXWEBSOCKET_LIB}
    ZLIB::ZLIB
    CURL::libcurl
    Threads::Threads
)

install(TARGETS ${TARGET} RUNTIME)

examples/websocket-stream/README.md

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# WebSocket Whisper Stream Example
2+
3+
This example demonstrates a WebSocket-based real-time audio transcription service using the Whisper model. The server captures audio from clients, processes it using the Whisper model, and sends transcriptions back through WebSocket connections.
4+
5+
## Features
6+
7+
- Real-time audio transcription
8+
- WebSocket communication for audio and transcription data
9+
- Configurable parameters for model, language, and processing settings
10+
- Integration with backend services via HTTP requests
11+
12+
## Usage
13+
14+
Run the server with the following command:
15+
16+
```bash
17+
./build/bin/whisper-websocket-stream -m ./models/ggml-large-v3-turbo.bin -t 8 --host 0.0.0.0 --port 9002 --forward-url http://localhost:8080/completion
18+
```
19+
20+
### Parameters
21+
22+
- `-m` or `--model`: Path to the Whisper model file.
23+
- `-t` or `--threads`: Number of threads for processing.
24+
- `-H` or `--host`: Hostname or IP address to bind the server to.
25+
- `-p` or `--port`: Port number for the server.
26+
- `-f` or `--forward-url`: URL to forward transcriptions to a backend service.
27+
- `-nm` or `--max-messages`: Maximum number of messages before sending to the backend.
28+
- `-l` or `--language`: Spoken language for transcription.
29+
- `-vth` or `--vad-thold`: Voice activity detection threshold.
30+
- `-tr` or `--translate`: Enable translation to English.
31+
- `-ng` or `--no-gpu`: Disable GPU usage.
32+
- `-bs` or `--beam-size`: Beam size for beam search.
33+
34+
## Building
35+
36+
To build the server, follow these steps:
37+
38+
```bash
39+
# Install dependencies
40+
git clone --depth 1 https://github.com/machinezone/IXWebSocket/
41+
cd IXWebSocket
42+
mkdir -p build && cd build && cmake -GNinja .. && sudo ninja -j$(nproc) install
43+
# Build the project
44+
#cuda is optional
45+
git clone --depth 1 https://github.com/ggml-org/whisper.cpp
46+
cd whisper.cpp
47+
mkdir -p build && cd build
48+
cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DWEBSOCKET=ON -DGGML_CUDA=ON ..
49+
ninja -j$(nproc)
50+
51+
# Run the server
52+
./bin/whisper-websocket-stream --help
53+
```
54+
55+
## Client Integration
56+
57+
Clients can connect to the WebSocket server and send audio data. The server processes the audio and sends transcriptions back through the WebSocket connection.
58+
59+
### Example Client Code (JavaScript)
60+
61+
```javascript
62+
const socket = new WebSocket('ws://localhost:9002');
63+
64+
socket.onopen = () => {
65+
console.log('Connected to WebSocket server');
66+
};
67+
68+
socket.onmessage = (event) => {
69+
console.log('Transcription:', event.data);
70+
};
71+
72+
socket.onclose = () => {
73+
console.log('Disconnected from WebSocket server');
74+
};
75+
76+
// Function to send audio data to the server
77+
function sendAudioData(audioData) {
78+
socket.send(audioData);
79+
}
80+
```
81+
82+
## Backend Integration
83+
84+
The server can forward transcriptions to a backend service via HTTP POST requests. Use the `--forward-url` (`-f`) option to specify the backend service URL.
85+
86+
## Dependencies
87+
- whisper.cpp
88+
- ixwebsocket for WebSocket communication
89+
- libcurl for HTTP requests
90+
```
+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#ifndef CLIENT_SESSION_H
2+
#define CLIENT_SESSION_H
3+
#include <vector>
4+
#include <mutex>
5+
#include <atomic>
6+
#include "ixwebsocket/IXWebSocketServer.h"
7+
#include "message-buffer.h"
8+
struct ClientSession {
9+
std::vector<float> pcm_buffer;
10+
std::mutex mtx;
11+
std::atomic<bool> active{true};
12+
ix::WebSocket *connection;
13+
MessageBuffer buffToBackend;
14+
};
15+
#endif

examples/websocket-stream/index.html

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>Mic to WebSocket</title>
</head>
<body>
    <button id="startBtn">Start Mic</button>
    <div id="status"></div>

    <script>
        // Captures microphone audio at 16 kHz, converts it to 16-bit PCM and
        // streams the samples over a WebSocket.
        const startBtn = document.getElementById('startBtn');
        const statusDiv = document.getElementById('status');

        // Server address: pass ?ws=ws://host:port to override. By default the
        // host that served this page is used (localhost when opened from disk),
        // on the server's default port 9002.
        const wsOverride = new URLSearchParams(window.location.search).get('ws');
        const wsUrl = wsOverride || `ws://${window.location.hostname || 'localhost'}:9002`;

        let isRecording = false;
        let socket = null;
        let micStream = null;
        let audioContext = null;
        let processor = null;

        // Clamp each float sample to [-1, 1] and scale it into the int16 range.
        function floatTo16BitPCM(input) {
            const output = new Int16Array(input.length);
            for (let i = 0; i < input.length; i++) {
                output[i] = Math.max(-1, Math.min(1, input[i])) * 0x7FFF;
            }
            return output;
        }

        // Release every resource acquired while recording. Safe to call more
        // than once and after a partially failed start.
        function releaseResources() {
            if (processor) {
                processor.onaudioprocess = null;
                processor.disconnect();
                processor = null;
            }
            if (audioContext) {
                audioContext.close();
                audioContext = null;
            }
            if (micStream) {
                // Stopping the tracks turns off the browser's microphone indicator.
                micStream.getTracks().forEach((track) => track.stop());
                micStream = null;
            }
            if (socket) {
                socket.close();
                socket = null;
            }
        }

        function setIdle(message) {
            statusDiv.textContent = message;
            startBtn.textContent = 'Start Mic';
            isRecording = false;
        }

        startBtn.addEventListener('click', async () => {
            if (isRecording) {
                releaseResources();
                setIdle('Stopped');
                return;
            }

            try {
                socket = new WebSocket(wsUrl);

                micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
                audioContext = new AudioContext({ sampleRate: 16000 });
                const source = audioContext.createMediaStreamSource(micStream);

                processor = audioContext.createScriptProcessor(1024, 1, 1);

                source.connect(processor);
                processor.connect(audioContext.destination);

                processor.onaudioprocess = (e) => {
                    const input = e.inputBuffer.getChannelData(0);
                    const int16Data = floatTo16BitPCM(input);

                    if (socket && socket.readyState === WebSocket.OPEN) {
                        socket.send(int16Data.buffer);
                    }
                };

                statusDiv.textContent = 'Recording...';
                startBtn.textContent = 'Stop';
                isRecording = true;
            } catch (err) {
                console.error('Error accessing microphone:', err);
                // Do not leave the socket or a half-initialised audio graph behind.
                releaseResources();
                setIdle('Error accessing microphone');
            }
        });
    </script>
</body>
</html>

examples/websocket-stream/main.cpp

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <string>

#include "whisper.h"
#include "server-params.h"
#include "whisper-server.h"
5+
6+
#define CONVERT_FROM_PCM_16
7+
std::string forward_url = "http://127.0.0.1:8080/completion";
8+
size_t max_messages = 1000;
9+
10+
// Prints the command-line help to stderr, showing the current value of every
// option (from `params` and the file-level forward_url / max_messages) in brackets.
//   argc   - unused; kept so the signature mirrors main()
//   argv   - argv[0] is printed as the program name
//   params - source of the displayed default values
void print_usage(int argc, char** argv, const ServerParams& params) {
    (void) argc;

    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, " -h, --help show this help message and exit\n");
    fprintf(stderr, " -H HOST, --host HOST [%-7s] hostname or ip\n", params.host.c_str());
    fprintf(stderr, " -p PORT, --port PORT [%-7d] server port\n", params.port);
    fprintf(stderr, " -f FORWARD_URL, --forward-url FORWARD_URL [%-7s] forward url\n", forward_url.c_str());
    fprintf(stderr, " -t N, --threads N [%-7d] number of threads\n", params.n_threads);
    // max_messages is a size_t, so it must be printed with %zu rather than %d.
    fprintf(stderr, " -nm max_messages, --max-messages max_messages [%-7zu] max messages before send to backend\n", max_messages);
    fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
    fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity threshold\n", params.vad_thold);
    fprintf(stderr, " -tr, --translate [%-7s] translate to english\n", params.translate ? "true" : "false");
    fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
    fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
    fprintf(stderr, "\n");
}
29+
30+
bool parse_params(int argc, char** argv, ServerParams& params) {
31+
for (int i = 1; i < argc; i++) {
32+
std::string arg = argv[i];
33+
34+
if (arg == "-h" || arg == "--help") {
35+
print_usage(argc, argv, params);
36+
exit(0);
37+
}
38+
else if (arg == "-H" || arg == "--host") { params.host = argv[++i]; }
39+
else if (arg == "-p" || arg == "--port") { params.port = std::stoi(argv[++i]); }
40+
else if (arg == "-f" || arg == "--forward-url") { forward_url = argv[++i]; }
41+
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
42+
else if (arg == "-nm" || arg == "--max-messages") { max_messages = std::stoi(argv[++i]); }
43+
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
44+
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
45+
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
46+
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
47+
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
48+
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
49+
else {
50+
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
51+
print_usage(argc, argv, params);
52+
return false;
53+
}
54+
}
55+
return true;
56+
}
57+
58+
int main(int argc, char** argv) {
59+
ServerParams params;
60+
if (!parse_params(argc, argv, params)) {
61+
return 1;
62+
}
63+
if (params.port < 1 || params.port > 65535) {
64+
throw std::invalid_argument("Invalid port number");
65+
}
66+
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
67+
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
68+
return 1;
69+
}
70+
71+
WhisperServer server(params);
72+
server.run();
73+
return 0;
74+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#include <sstream>
2+
#include <mutex>
3+
#include <curl/curl.h>
4+
5+
#include "message-buffer.h"
6+
extern std::string forward_url;
7+
extern size_t max_messages;
8+
namespace {
    // File-level buffer state, shared by everything in this translation unit.
    std::stringstream ss;        // pending messages, one per line
    std::mutex mtx;              // taken by add_message() and get_payload()
    size_t current_count = 0;    // messages appended since the last reset

    // libcurl write callback: appends the received chunk to the std::string
    // that `userdata` points to and reports the whole chunk as consumed.
    static size_t write_callback(char* ptr, size_t size, size_t nmemb, void* userdata) {
        const size_t chunk_bytes = size * nmemb;
        std::string* sink = static_cast<std::string*>(userdata);
        sink->append(ptr, chunk_bytes);
        return chunk_bytes;
    }
}
17+
18+
void MessageBuffer::add_message(const char* msg) {
19+
std::lock_guard<std::mutex> lock(mtx);
20+
ss << std::string(msg) << '\n';
21+
if (++current_count >= max_messages) {
22+
flush();
23+
}
24+
}
25+
26+
std::string MessageBuffer::get_payload() {
27+
std::lock_guard<std::mutex> lock(mtx);
28+
return ss.str();
29+
}
30+
31+
void MessageBuffer::flush() {
32+
std::string payload = get_payload();
33+
if (!payload.empty()) {
34+
send_via_http(payload);
35+
ss.str(""); //clear string stream
36+
current_count = 0;
37+
}
38+
}
39+
40+
void MessageBuffer::send_via_http(const std::string& data) {
41+
CURL* curl = curl_easy_init();
42+
if (!curl) {
43+
printf("CURL init failed");
44+
return;
45+
}
46+
47+
//make headers
48+
struct curl_slist* headers = NULL;
49+
headers = curl_slist_append(headers, "Content-Type: text/plain");
50+
std::string cid_header = "X-Connection-ID: " + connection_id;
51+
headers = curl_slist_append(headers, cid_header.c_str());
52+
53+
//config curl
54+
std::string response;
55+
printf("sending to %s\n", forward_url.c_str());
56+
curl_easy_setopt(curl, CURLOPT_URL, forward_url.c_str());
57+
curl_easy_setopt(curl, CURLOPT_POST, 1L);
58+
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
59+
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, data.c_str());
60+
curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, data.size());
61+
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
62+
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
63+
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 5L);
64+
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 2L);
65+
66+
//run curl
67+
for (int retry = 0; retry < 3; ++retry) {
68+
CURLcode res = curl_easy_perform(curl);
69+
if (res == CURLE_OK) {
70+
printf("[Response (%s): %s\n", connection_id.c_str(), response.c_str());
71+
break;
72+
}
73+
printf("[CURL error: %s\n", curl_easy_strerror(res));
74+
}
75+
76+
//clean
77+
curl_slist_free_all(headers);
78+
curl_easy_cleanup(curl);
79+
}
+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#ifndef MESSAGE_BUFFER_H
#define MESSAGE_BUFFER_H
// std::string is used in this header, so include it here rather than relying
// on whatever the including file happened to pull in first.
#include <string>

// Accumulates text messages and forwards them to an HTTP backend.
// NOTE(review): the buffered text and message counter live at file scope in
// message-buffer.cpp, so they appear to be shared by all MessageBuffer
// instances rather than kept per object -- confirm this is intended.
class MessageBuffer {
public:
    // Sent with every request as the X-Connection-ID header.
    std::string connection_id;

    // Appends `msg` followed by a newline to the pending buffer and forwards
    // the buffer once the configured maximum number of messages is reached.
    void add_message(const char* msg);

    // Returns a copy of the currently buffered text.
    std::string get_payload();

    // Sends the buffered text (if any) to the backend and clears the buffer.
    void flush();

    // POSTs `data` to the configured forward URL.
    void send_via_http(const std::string& data);
};
#endif
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#ifndef SERVER_PARAMS_H
#define SERVER_PARAMS_H
// Include everything this header uses (std::min/std::max, int32_t,
// std::string) so it compiles regardless of include order.
#include <algorithm>
#include <cstdint>
#include <string>
#include <thread>

// Configuration of the WebSocket streaming server with its default values.
struct ServerParams {
    int32_t port = 9002;
    // At most 4 threads by default. hardware_concurrency() may return 0 when
    // the value cannot be determined, so clamp to at least 1.
    int32_t n_threads = std::max(1, std::min(4, (int32_t)std::thread::hardware_concurrency()));
    int32_t audio_ctx = 0;
    int32_t beam_size = -1;

    float vad_thold = 0.6f;

    bool translate = false;
    bool print_special = false;
    bool no_timestamps = true;
    bool tinydiarize = false;
    bool use_gpu = true;
    bool flash_attn = true;

    std::string language = "en";
    std::string model = "ggml-large-v3-turbo.bin";
    std::string host = "0.0.0.0";
};
#endif

0 commit comments

Comments
 (0)