diff --git a/README.md b/README.md index 1ebbfc2..34c8aff 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ const client = new RealtimeClient({ apiKey: process.env.OPENAI_API_KEY }); client.updateSession({ instructions: 'You are a great, upbeat friend.' }); client.updateSession({ voice: 'alloy' }); client.updateSession({ - turn_detection: { type: 'none' }, // or 'server_vad' + turn_detection: { type: 'none' }, // or 'server_vad' or 'semantic_vad' input_audio_transcription: { model: 'whisper-1' }, }); diff --git a/dist/lib/client.d.ts b/dist/lib/client.d.ts index 7ea2201..24e1a3a 100644 --- a/dist/lib/client.d.ts +++ b/dist/lib/client.d.ts @@ -13,6 +13,13 @@ * @property {number} [prefix_padding_ms] * @property {number} [silence_duration_ms] */ +/** + * @typedef {Object} TurnDetectionSemanticVadType + * @property {"semantic_vad"} type + * @property {boolean} [create_response] + * @property {boolean} [interrupt_response] + * @property {"auto"|"low"|"medium"|"high"} [eagerness] + */ /** * Tool definitions * @typedef {Object} ToolDefinitionType @@ -30,7 +37,7 @@ * @property {AudioFormatType} [input_audio_format] * @property {AudioFormatType} [output_audio_format] * @property {AudioTranscriptionType|null} [input_audio_transcription] - * @property {TurnDetectionServerVadType|null} [turn_detection] + * @property {TurnDetectionServerVadType|TurnDetectionSemanticVadType|null} [turn_detection] * @property {ToolDefinitionType[]} [tools] * @property {"auto"|"none"|"required"|{type:"function",name:string}} [tool_choice] * @property {number} [temperature] @@ -235,9 +242,9 @@ export class RealtimeClient extends RealtimeEventHandler { disconnect(): void; /** * Gets the active turn detection mode - * @returns {"server_vad"|null} + * @returns {"server_vad"|"semantic_vad"|null} */ - getTurnDetectionType(): "server_vad" | null; + getTurnDetectionType(): "server_vad" | "semantic_vad" | null; /** * Add a tool and handler * @param {ToolDefinitionType} definition @@ -321,6 +328,12 @@ export type TurnDetectionServerVadType = { prefix_padding_ms?: number; silence_duration_ms?: number; }; +export type TurnDetectionSemanticVadType = { + type: "semantic_vad"; + create_response?: boolean; + interrupt_response?: boolean; + eagerness?: "auto" | "low" | "medium" | "high"; +}; /** * Tool definitions */ @@ -341,7 +354,7 @@ export type SessionResourceType = { input_audio_format?: AudioFormatType; output_audio_format?: AudioFormatType; input_audio_transcription?: AudioTranscriptionType | null; - turn_detection?: TurnDetectionServerVadType | null; + turn_detection?: TurnDetectionServerVadType | TurnDetectionSemanticVadType | null; tools?: ToolDefinitionType[]; tool_choice?: "auto" | "none" | "required" | { type: "function"; diff --git a/lib/api.js b/lib/api.js index cc8f904..7681c69 100644 --- a/lib/api.js +++ b/lib/api.js @@ -56,7 +56,7 @@ export class RealtimeAPI extends RealtimeEventHandler { * @param {{model?: string}} [settings] * @returns {Promise} */ - async connect({ model } = { model: 'gpt-4o-realtime-preview-2024-10-01' }) { + async connect({ model } = { model: 'gpt-4o-realtime-preview-2024-12-17' }) { if (!this.apiKey && this.url === this.defaultUrl) { console.warn(`No apiKey provided for connection to "${this.url}"`); } @@ -73,7 +73,10 @@ export class RealtimeAPI extends RealtimeEventHandler { ); } const WebSocket = globalThis.WebSocket; - const ws = new WebSocket(`${this.url}${model ? `?model=${model}` : ''}`, [ + const url = new URL(this.url); + url.searchParams.set('model', model); + + const ws = new WebSocket(url.toString(), [ 'realtime', `openai-insecure-api-key.${this.apiKey}`, 'openai-beta.realtime-v1', @@ -113,7 +116,7 @@ export class RealtimeAPI extends RealtimeEventHandler { const wsModule = await import(/* webpackIgnore: true */ moduleName); const WebSocket = wsModule.default; const ws = new WebSocket( - 'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01', + `wss://api.openai.com/v1/realtime?model=${model}`, [], { finishRequest: (request) => { diff --git a/lib/client.js b/lib/client.js index 2c48d7f..ec34813 100644 --- a/lib/client.js +++ b/lib/client.js @@ -21,6 +21,14 @@ import { RealtimeUtils } from './utils.js'; * @property {number} [silence_duration_ms] */ +/** + * @typedef {Object} TurnDetectionSemanticVadType + * @property {"semantic_vad"} type + * @property {boolean} [create_response] + * @property {boolean} [interrupt_response] + * @property {"auto"|"low"|"medium"|"high"} [eagerness] + */ + /** * Tool definitions * @typedef {Object} ToolDefinitionType @@ -39,7 +47,7 @@ import { RealtimeUtils } from './utils.js'; * @property {AudioFormatType} [input_audio_format] * @property {AudioFormatType} [output_audio_format] * @property {AudioTranscriptionType|null} [input_audio_transcription] - * @property {TurnDetectionServerVadType|null} [turn_detection] + * @property {TurnDetectionServerVadType|TurnDetectionSemanticVadType|null} [turn_detection] * @property {ToolDefinitionType[]} [tools] * @property {"auto"|"none"|"required"|{type:"function",name:string}} [tool_choice] * @property {number} [temperature] @@ -218,6 +226,12 @@ export class RealtimeClient extends RealtimeEventHandler { prefix_padding_ms: 300, // How much audio to include in the audio stream before the speech starts. silence_duration_ms: 200, // How long to wait to mark the speech as stopped. }; + this.defaultSemanticVadConfig = { + type: 'semantic_vad', + create_response: true, + interrupt_response: false, + eagerness: 'auto', + }; this.realtime = new RealtimeAPI({ url, apiKey, @@ -423,7 +437,7 @@ export class RealtimeClient extends RealtimeEventHandler { /** * Gets the active turn detection mode - * @returns {"server_vad"|null} + * @returns {"server_vad"|"semantic_vad"|null} */ getTurnDetectionType() { return this.sessionConfig.turn_detection?.type || null; @@ -504,8 +518,24 @@ export class RealtimeClient extends RealtimeEventHandler { input_audio_transcription !== void 0 && (this.sessionConfig.input_audio_transcription = input_audio_transcription); - turn_detection !== void 0 && - (this.sessionConfig.turn_detection = turn_detection); + + // Apply turn detection config with defaults if needed + if (turn_detection !== void 0) { + if (turn_detection?.type === 'semantic_vad') { + this.sessionConfig.turn_detection = { + ...this.defaultSemanticVadConfig, + ...turn_detection, + }; + } else if (turn_detection?.type === 'server_vad') { + this.sessionConfig.turn_detection = { + ...this.defaultServerVadConfig, + ...turn_detection, + }; + } else { + this.sessionConfig.turn_detection = turn_detection; + } + } + tools !== void 0 && (this.sessionConfig.tools = tools); tool_choice !== void 0 && (this.sessionConfig.tool_choice = tool_choice); temperature !== void 0 && (this.sessionConfig.temperature = temperature);