From ce0c34825f91afe865ed047acbfea542e47c9173 Mon Sep 17 00:00:00 2001
From: Daniil Okhlopkov <5613295+ohld@users.noreply.github.com>
Date: Thu, 2 Nov 2023 13:52:35 +0000
Subject: [PATCH] Store Voice temp files in memory and don't convert them

---
 bot/bot.py          | 31 +++++++++----------------------
 bot/openai_utils.py |  4 ++--
 requirements.txt    |  3 +--
 3 files changed, 12 insertions(+), 26 deletions(-)
diff --git a/bot/bot.py b/bot/bot.py
index 5a9e1da..f4510a6 100644
--- a/bot/bot.py
+++ b/bot/bot.py
@@ -1,12 +1,9 @@
-import os
+import io
 import logging
 import asyncio
 import traceback
 import html
 import json
-import tempfile
-import pydub
-from pathlib import Path
 from datetime import datetime
 import openai
 
@@ -342,25 +339,15 @@ async def voice_message_handle(update: Update, context: CallbackContext):
     db.set_user_attribute(user_id, "last_interaction", datetime.now())
 
     voice = update.message.voice
-    with tempfile.TemporaryDirectory() as tmp_dir:
-        tmp_dir = Path(tmp_dir)
-        voice_ogg_path = tmp_dir / "voice.ogg"
-
-        # download
-        voice_file = await context.bot.get_file(voice.file_id)
-        await voice_file.download_to_drive(voice_ogg_path)
-
-        # convert to mp3
-        voice_mp3_path = tmp_dir / "voice.mp3"
-        pydub.AudioSegment.from_file(voice_ogg_path).export(voice_mp3_path, format="mp3")
-
-        # transcribe
-        with open(voice_mp3_path, "rb") as f:
-            transcribed_text = await openai_utils.transcribe_audio(f)
-
-            if transcribed_text is None:
-                 transcribed_text = ""
+    voice_file = await context.bot.get_file(voice.file_id)
+
+    # store file in memory, not on disk
+    buf = io.BytesIO()
+    await voice_file.download_to_memory(buf)
+    buf.name = "voice.oga"  # file extension is required
+    buf.seek(0)  # move cursor to the beginning of the buffer
 
+    transcribed_text = await openai_utils.transcribe_audio(buf)
     text = f"🎤: <i>{transcribed_text}</i>"
     await update.message.reply_text(text, parse_mode=ParseMode.HTML)
 
diff --git a/bot/openai_utils.py b/bot/openai_utils.py
index 7b06e77..5122c60 100644
--- a/bot/openai_utils.py
+++ b/bot/openai_utils.py
@@ -189,9 +189,9 @@ class ChatGPT:
         return n_input_tokens, n_output_tokens
 
 
-async def transcribe_audio(audio_file):
+async def transcribe_audio(audio_file) -> str:
     r = await openai.Audio.atranscribe("whisper-1", audio_file)
-    return r["text"]
+    return r["text"] or ""
 
 
 async def generate_images(prompt, n_images=4, size="512x512"):
diff --git a/requirements.txt b/requirements.txt
index 354063b..1f82206 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,5 +3,4 @@ openai>=0.27.0
 tiktoken>=0.3.0
 PyYAML==6.0
 pymongo==4.3.3
-python-dotenv==0.21.0
-pydub==0.25.1
\ No newline at end of file
+python-dotenv==0.21.0
\ No newline at end of file