Skip to content

Commit ed26da3

Browse files
committed
New features initial readme
1 parent d67dc66 commit ed26da3

File tree

3 files changed

+99
-8
lines changed

3 files changed

+99
-8
lines changed

Readme.md

+10
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,13 @@ node index.js
7474
```
7575
## Test the app
7676
With the development server running, call the phone number you purchased in the **Prerequisites**. After the introduction, you should be able to talk to the AI Assistant. Have fun!
77+
78+
## Special features
79+
80+
### Have the AI speak first
81+
To have the AI voice assistant talk before the user, uncomment the line `// sendInitialConversationItem();`. The initial greeting is controlled in `sendInitialConversationItem`.
82+
83+
### Interrupt handling/AI preemption
84+
When the user speaks and OpenAI sends `input_audio_buffer.speech_started`, the code will clear the Twilio Media Streams buffer and send OpenAI a `conversation.item.truncate` event.
85+
86+
Depending on your application's needs, you may want to use the [`input_audio_buffer.speech_stopped`](https://platform.openai.com/docs/api-reference/realtime-server-events/input-audio-buffer-speech-stopped) event, instead.

index.js

+88-7
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
import Fastify from 'fastify';
22
import WebSocket from 'ws';
3-
import fs from 'fs';
43
import dotenv from 'dotenv';
54
import fastifyFormBody from '@fastify/formbody';
65
import fastifyWs from '@fastify/websocket';
76

87
// Load environment variables from .env file
98
dotenv.config();
109

11-
// Retrieve the OpenAI API key from environment variables. You must have OpenAI Realtime API access.
10+
// Retrieve the OpenAI API key from environment variables.
1211
const { OPENAI_API_KEY } = process.env;
1312

1413
if (!OPENAI_API_KEY) {
@@ -26,7 +25,7 @@ const SYSTEM_MESSAGE = 'You are a helpful and bubbly AI assistant who loves to c
2625
const VOICE = 'alloy';
2726
const PORT = process.env.PORT || 5050; // Allow dynamic port assignment
2827

29-
// List of Event Types to log to the console. See OpenAI Realtime API Documentation. (session.updated is handled separately.)
28+
// List of Event Types to log to the console. See the OpenAI Realtime API Documentation: https://platform.openai.com/docs/api-reference/realtime
3029
const LOG_EVENT_TYPES = [
3130
'response.content.done',
3231
'rate_limits.updated',
@@ -37,6 +36,11 @@ const LOG_EVENT_TYPES = [
3736
'session.created'
3837
];
3938

39+
// Track drift between OpenAI and system clocks, and the assistant's last Item ID
40+
let localStartTime;
41+
let lastDrift = null;
42+
let lastAssistantItem;
43+
4044
// Root Route
4145
fastify.get('/', async (request, reply) => {
4246
reply.send({ message: 'Twilio Media Stream Server is running!' });
@@ -63,7 +67,6 @@ fastify.register(async (fastify) => {
6367
fastify.get('/media-stream', { websocket: true }, (connection, req) => {
6468
console.log('Client connected');
6569

66-
6770
const openAiWs = new WebSocket('wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01', {
6871
headers: {
6972
Authorization: `Bearer ${OPENAI_API_KEY}`,
@@ -73,7 +76,7 @@ fastify.register(async (fastify) => {
7376

7477
let streamSid = null;
7578

76-
const sendSessionUpdate = () => {
79+
const initializeSession = () => {
7780
const sessionUpdate = {
7881
type: 'session.update',
7982
session: {
@@ -89,14 +92,39 @@ fastify.register(async (fastify) => {
8992

9093
console.log('Sending session update:', JSON.stringify(sessionUpdate));
9194
openAiWs.send(JSON.stringify(sessionUpdate));
95+
96+
// Uncomment the following line to have AI speak first:
97+
// sendInitialConversationItem();
98+
};
99+
100+
const sendInitialConversationItem = () => {
101+
const initialConversationItem = {
102+
type: 'conversation.item.create',
103+
item: {
104+
type: 'message',
105+
role: 'user',
106+
content: [
107+
{
108+
type: 'input_text',
109+
text: 'Greet the user with "Hello there! I am an AI voice assistant powered by Twilio and the OpenAI Realtime API. You can ask me for facts, jokes, or anything you can imagine. How can I help you?"'
110+
}
111+
]
112+
}
113+
};
114+
115+
console.log('Sending initial conversation item:', JSON.stringify(initialConversationItem));
116+
openAiWs.send(JSON.stringify(initialConversationItem));
117+
openAiWs.send(JSON.stringify({ type: 'response.create' }));
92118
};
93119

94120
// Open event for OpenAI WebSocket
95121
openAiWs.on('open', () => {
122+
localStartTime = Date.now(); // Start local timer
96123
console.log('Connected to the OpenAI Realtime API');
97-
setTimeout(sendSessionUpdate, 250); // Ensure connection stability, send after .25 seconds
124+
setTimeout(initializeSession, 100);
98125
});
99126

127+
100128
// Listen for messages from the OpenAI WebSocket (and send to Twilio if necessary)
101129
openAiWs.on('message', (data) => {
102130
try {
@@ -118,11 +146,65 @@ fastify.register(async (fastify) => {
118146
};
119147
connection.send(JSON.stringify(audioDelta));
120148
}
149+
150+
// We can get the following event while Twilio is still playing audio from the AI
151+
if (response.type === 'input_audio_buffer.speech_started') {
152+
handleSpeechStartedEvent(response);
153+
}
154+
155+
if (response.type === 'response.done') {
156+
handleResponseDoneEvent(response);
157+
}
121158
} catch (error) {
122159
console.error('Error processing OpenAI message:', error, 'Raw message:', data);
123160
}
124161
});
125162

163+
// Interruption handling
164+
const handleSpeechStartedEvent = (response) => {
165+
const localTime = Date.now();
166+
const drift = localTime - localStartTime - response.audio_start_ms;
167+
168+
console.log('OpenAI Speech started at', response.audio_start_ms, 'ms from OpenAI perspective');
169+
console.log('Local time at speech start:', localTime - localStartTime, 'ms');
170+
console.log('Time drift (OpenAI - Local):', drift, 'ms');
171+
172+
if (lastDrift === null || drift !== lastDrift) {
173+
console.log('Drift has changed. Previous:', lastDrift, 'Current:', drift);
174+
lastDrift = drift;
175+
}
176+
177+
if (streamSid) {
178+
connection.send(JSON.stringify({
179+
event: 'clear',
180+
streamSid: streamSid
181+
}));
182+
}
183+
184+
if (lastAssistantItem) {
185+
const truncateEvent = {
186+
type: 'conversation.item.truncate',
187+
item_id: lastAssistantItem,
188+
content_index: 0,
189+
audio_end_ms: response.audio_start_ms
190+
};
191+
console.log('Sending truncation event:', JSON.stringify(truncateEvent));
192+
openAiWs.send(JSON.stringify(truncateEvent));
193+
lastAssistantItem = null;
194+
}
195+
};
196+
197+
// Interruption handling requires knowing the preempted conversation's ID
198+
const handleResponseDoneEvent = (response) => {
199+
const outputItems = response.response.output;
200+
for (const item of outputItems) {
201+
if (item.role === 'assistant') {
202+
lastAssistantItem = item.id;
203+
break; // Consider the first relevant assistant item
204+
}
205+
}
206+
};
207+
126208
// Handle incoming messages from Twilio
127209
connection.on('message', (message) => {
128210
try {
@@ -135,7 +217,6 @@ fastify.register(async (fastify) => {
135217
type: 'input_audio_buffer.append',
136218
audio: data.media.payload
137219
};
138-
139220
openAiWs.send(JSON.stringify(audioAppend));
140221
}
141222
break;

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "speech-assistant-openai-realtime-api-node",
3-
"version": "1.0.0",
3+
"version": "1.0.1",
44
"description": "",
55
"main": "index.js",
66
"scripts": {

0 commit comments

Comments
 (0)