@@ -3,7 +3,6 @@
 from pydantic import BaseModel
 from typing import Optional
 import azure.cognitiveservices.speech as speechsdk
-import openai
 from openai import OpenAI
 import pyaudio
 import wave
@@ -47,11 +46,7 @@ class VoiceAssistant:
     def __init__(self):
         self.conversation_history = []
         self.temp_dir = tempfile.mkdtemp()
-
-        # Initialize OpenAI client
         self.openai_client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
-
-        # Initialize Azure Speech config
         self.speech_config = speechsdk.SpeechConfig(
             subscription=azure_speech_key,
             region=azure_service_region
@@ -82,7 +77,6 @@ async def record_audio(self) -> bytes:
             data = stream.read(self.CHUNK)
             frames.append(data)

-        # Save to temporary WAV file
         temp_path = os.path.join(self.temp_dir, "temp_recording.wav")
         wf = wave.open(temp_path, 'wb')
         wf.setnchannels(self.CHANNELS)
@@ -91,7 +85,6 @@ async def record_audio(self) -> bytes:
         wf.writeframes(b''.join(frames))
         wf.close()

-        # Read the file as bytes
         with open(temp_path, 'rb') as audio_file:
             audio_bytes = audio_file.read()

@@ -104,23 +97,18 @@ async def record_audio(self) -> bytes:

     async def transcribe_audio(self, audio_bytes: bytes) -> str:
         """Convert speech to text using OpenAI Whisper"""
-        print("🔊 Transcribing audio...")
         try:
             response = self.openai_client.audio.transcriptions.create(
                 model="whisper-1",
                 file=("audio.wav", audio_bytes),
             )
-            print("✅ Transcription complete!")
-            print(response.text + "\n")
             return response.text
         except Exception as e:
             raise VoiceAssistantError(f"Transcription failed: {str(e)}")

     async def get_chat_response(self, text: str) -> str:
         """Get response from ChatGPT"""
-        print("💬 Getting chat response...")
         try:
-            # Add user message to conversation history
             self.conversation_history.append({"role": "user", "content": text})

             response = self.openai_client.chat.completions.create(
@@ -130,10 +118,6 @@ async def get_chat_response(self, text: str) -> str:
             )

             assistant_response = response.choices[0].message.content
-            print("✅ Chat response complete!")
-            print(assistant_response + "\n")
-
-            # Add assistant response to conversation history
             self.conversation_history.append({"role": "assistant", "content": assistant_response})

             return assistant_response
@@ -154,7 +138,6 @@ async def synthesize_speech(self, text: str) -> str:
         result = synthesizer.speak_text_async(text).get()

         if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
-            print("✅ Speech synthesis complete!")
             return output_path
         else:
             raise VoiceAssistantError("Speech synthesis failed")
@@ -165,17 +148,11 @@ async def synthesize_speech(self, text: str) -> str:
     async def process_voice_input(self, audio_data: bytes = None) -> tuple[str, str]:
         """Process voice input and return response text and audio file path"""
         try:
-            # Record audio if not provided
             if audio_data is None:
                 audio_data = await self.record_audio()

-            # Convert speech to text
             transcript = await self.transcribe_audio(audio_data)
-
-            # Get ChatGPT response
             response_text = await self.get_chat_response(transcript)
-
-            # Convert response to speech
             audio_path = await self.synthesize_speech(response_text)

             return response_text, audio_path
@@ -191,7 +168,6 @@ def cleanup(self):
         except Exception:
             pass

-# API Models
 class ChatResponse(BaseModel):
     text: str
     audio_path: str
@@ -209,19 +185,15 @@ async def chat_endpoint(audio_file: UploadFile = File(None)):
     try:
         audio_data = None
         if audio_file:
-            # Read uploaded file
             audio_data = await audio_file.read()

         response_text, audio_path = await assistant.process_voice_input(audio_data)

-        # Read the audio file into memory before cleanup
         with open(audio_path, 'rb') as f:
             audio_content = f.read()

-        # Clean up files
         assistant.cleanup()

-        # Create a new temporary file for the response
         temp_response_path = tempfile.mktemp(suffix='.wav')
         with open(temp_response_path, 'wb') as f:
             f.write(audio_content)