Skip to content

Commit ed26da3

Browse files
committed
New features initial readme
1 parent d67dc66 commit ed26da3

File tree

3 files changed

+99
-8
lines changed

3 files changed

+99
-8
lines changed

Readme.md

+10
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,13 @@ node index.js
7474
```
7575
## Test the app
7676
With the development server running, call the phone number you purchased in the **Prerequisites**. After the introduction, you should be able to talk to the AI Assistant. Have fun!
77+
78+
## Special features
79+
80+
### Have the AI speak first
81+
To have the AI voice assistant talk before the user, uncomment the line `// sendInitialConversationItem();`. The initial greeting is controlled in `sendInitialConversationItem`.
82+
83+
### Interrupt handling/AI preemption
84+
When the user speaks and OpenAI sends `input_audio_buffer.speech_started`, the code will clear the Twilio Media Streams buffer and send OpenAI a `conversation.item.truncate` event.
85+
86+
Depending on your application's needs, you may want to use the [`input_audio_buffer.speech_stopped`](https://platform.openai.com/docs/api-reference/realtime-server-events/input-audio-buffer-speech-stopped) event, instead.

index.js

+88-7
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,13 @@
11
import Fastify from 'fastify';
22
import WebSocket from 'ws';
3-
import fs from 'fs';
43
import dotenv from 'dotenv';
54
import fastifyFormBody from '@fastify/formbody';
65
import fastifyWs from '@fastify/websocket';
76

87
// Load environment variables from .env file
98
dotenv.config();
109

11-
// Retrieve the OpenAI API key from environment variables. You must have OpenAI Realtime API access.
10+
// Retrieve the OpenAI API key from environment variables.
1211
const { OPENAI_API_KEY } = process.env;
1312

1413
if (!OPENAI_API_KEY) {
@@ -26,7 +25,7 @@ const SYSTEM_MESSAGE = 'You are a helpful and bubbly AI assistant who loves to c
2625
const VOICE = 'alloy';
2726
const PORT = process.env.PORT || 5050; // Allow dynamic port assignment
2827

29-
// List of Event Types to log to the console. See OpenAI Realtime API Documentation. (session.updated is handled separately.)
28+
// List of Event Types to log to the console. See the OpenAI Realtime API Documentation: https://platform.openai.com/docs/api-reference/realtime
3029
const LOG_EVENT_TYPES = [
3130
'response.content.done',
3231
'rate_limits.updated',
@@ -37,6 +36,11 @@ const LOG_EVENT_TYPES = [
3736
'session.created'
3837
];
3938

39+
// Track drift between OpenAI and system clocks, and the assistant's last Item ID
40+
let localStartTime;
41+
let lastDrift = null;
42+
let lastAssistantItem;
43+
4044
// Root Route
4145
fastify.get('/', async (request, reply) => {
4246
reply.send({ message: 'Twilio Media Stream Server is running!' });
@@ -63,7 +67,6 @@ fastify.register(async (fastify) => {
6367
fastify.get('/media-stream', { websocket: true }, (connection, req) => {
6468
console.log('Client connected');
6569

66-
6770
const openAiWs = new WebSocket('wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01', {
6871
headers: {
6972
Authorization: `Bearer ${OPENAI_API_KEY}`,
@@ -73,7 +76,7 @@ fastify.register(async (fastify) => {
7376

7477
let streamSid = null;
7578

76-
const sendSessionUpdate = () => {
79+
const initializeSession = () => {
7780
const sessionUpdate = {
7881
type: 'session.update',
7982
session: {
@@ -89,14 +92,39 @@ fastify.register(async (fastify) => {
8992

9093
console.log('Sending session update:', JSON.stringify(sessionUpdate));
9194
openAiWs.send(JSON.stringify(sessionUpdate));
95+
96+
// Uncomment the following line to have AI speak first:
97+
// sendInitialConversationItem();
98+
};
99+
100+
const sendInitialConversationItem = () => {
101+
const initialConversationItem = {
102+
type: 'conversation.item.create',
103+
item: {
104+
type: 'message',
105+
role: 'user',
106+
content: [
107+
{
108+
type: 'input_text',
109+
text: 'Greet the user with "Hello there! I am an AI voice assistant powered by Twilio and the OpenAI Realtime API. You can ask me for facts, jokes, or anything you can imagine. How can I help you?"'
110+
}
111+
]
112+
}
113+
};
114+
115+
console.log('Sending initial conversation item:', JSON.stringify(initialConversationItem));
116+
openAiWs.send(JSON.stringify(initialConversationItem));
117+
openAiWs.send(JSON.stringify({ type: 'response.create' }));
92118
};
93119

94120
// Open event for OpenAI WebSocket
95121
openAiWs.on('open', () => {
122+
localStartTime = Date.now(); // Start local timer
96123
console.log('Connected to the OpenAI Realtime API');
97-
setTimeout(sendSessionUpdate, 250); // Ensure connection stability, send after .25 seconds
124+
setTimeout(initializeSession, 100);
98125
});
99126

127+
100128
// Listen for messages from the OpenAI WebSocket (and send to Twilio if necessary)
101129
openAiWs.on('message', (data) => {
102130
try {
@@ -118,11 +146,65 @@ fastify.register(async (fastify) => {
118146
};
119147
connection.send(JSON.stringify(audioDelta));
120148
}
149+
150+
// We can get the following event while Twilio is still playing audio from the AI
151+
if (response.type === 'input_audio_buffer.speech_started') {
152+
handleSpeechStartedEvent(response);
153+
}
154+
155+
if (response.type === 'response.done') {
156+
handleResponseDoneEvent(response);
157+
}
121158
} catch (error) {
122159
console.error('Error processing OpenAI message:', error, 'Raw message:', data);
123160
}
124161
});
125162

163+
// Interruption handling
164+
const handleSpeechStartedEvent = (response) => {
165+
const localTime = Date.now();
166+
const drift = localTime - localStartTime - response.audio_start_ms;
167+
168+
console.log('OpenAI Speech started at', response.audio_start_ms, 'ms from OpenAI perspective');
169+
console.log('Local time at speech start:', localTime - localStartTime, 'ms');
170+
console.log('Time drift (OpenAI - Local):', drift, 'ms');
171+
172+
if (lastDrift === null || drift !== lastDrift) {
173+
console.log('Drift has changed. Previous:', lastDrift, 'Current:', drift);
174+
lastDrift = drift;
175+
}
176+
177+
if (streamSid) {
178+
connection.send(JSON.stringify({
179+
event: 'clear',
180+
streamSid: streamSid
181+
}));
182+
}
183+
184+
if (lastAssistantItem) {
185+
const truncateEvent = {
186+
type: 'conversation.item.truncate',
187+
item_id: lastAssistantItem,
188+
content_index: 0,
189+
audio_end_ms: response.audio_start_ms
190+
};
191+
console.log('Sending truncation event:', JSON.stringify(truncateEvent));
192+
openAiWs.send(JSON.stringify(truncateEvent));
193+
lastAssistantItem = null;
194+
}
195+
};
196+
197+
// Interruption handling requires knowing the preempted conversation's ID
198+
const handleResponseDoneEvent = (response) => {
199+
const outputItems = response.response.output;
200+
for (const item of outputItems) {
201+
if (item.role === 'assistant') {
202+
lastAssistantItem = item.id;
203+
break; // Consider the first relevant assistant item
204+
}
205+
}
206+
};
207+
126208
// Handle incoming messages from Twilio
127209
connection.on('message', (message) => {
128210
try {
@@ -135,7 +217,6 @@ fastify.register(async (fastify) => {
135217
type: 'input_audio_buffer.append',
136218
audio: data.media.payload
137219
};
138-
139220
openAiWs.send(JSON.stringify(audioAppend));
140221
}
141222
break;

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "speech-assistant-openai-realtime-api-node",
3-
"version": "1.0.0",
3+
"version": "1.0.1",
44
"description": "",
55
"main": "index.js",
66
"scripts": {

0 commit comments

Comments
 (0)