From ce0c34825f91afe865ed047acbfea542e47c9173 Mon Sep 17 00:00:00 2001 From: Daniil Okhlopkov <5613295+ohld@users.noreply.github.com> Date: Thu, 2 Nov 2023 13:52:35 +0000 Subject: [PATCH] Store Voice temp files in memory and don't convert them --- bot/bot.py | 31 +++++++++---------------------- bot/openai_utils.py | 4 ++-- requirements.txt | 3 +-- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index 5a9e1da..f4510a6 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -1,12 +1,9 @@ -import os +import io import logging import asyncio import traceback import html import json -import tempfile -import pydub -from pathlib import Path from datetime import datetime import openai @@ -342,25 +339,15 @@ async def voice_message_handle(update: Update, context: CallbackContext): db.set_user_attribute(user_id, "last_interaction", datetime.now()) voice = update.message.voice - with tempfile.TemporaryDirectory() as tmp_dir: - tmp_dir = Path(tmp_dir) - voice_ogg_path = tmp_dir / "voice.ogg" - - # download - voice_file = await context.bot.get_file(voice.file_id) - await voice_file.download_to_drive(voice_ogg_path) - - # convert to mp3 - voice_mp3_path = tmp_dir / "voice.mp3" - pydub.AudioSegment.from_file(voice_ogg_path).export(voice_mp3_path, format="mp3") - - # transcribe - with open(voice_mp3_path, "rb") as f: - transcribed_text = await openai_utils.transcribe_audio(f) - - if transcribed_text is None: - transcribed_text = "" + voice_file = await context.bot.get_file(voice.file_id) + + # store file in memory, not on disk + buf = io.BytesIO() + await voice_file.download_to_memory(buf) + buf.name = "voice.oga" # file extension is required + buf.seek(0) # move cursor to the beginning of the buffer + transcribed_text = await openai_utils.transcribe_audio(buf) text = f"🎤: {transcribed_text}" await update.message.reply_text(text, parse_mode=ParseMode.HTML) diff --git a/bot/openai_utils.py b/bot/openai_utils.py index 7b06e77..5122c60 100644 --- a/bot/openai_utils.py +++ b/bot/openai_utils.py @@ -189,9 +189,9 @@ class ChatGPT: return n_input_tokens, n_output_tokens -async def transcribe_audio(audio_file): +async def transcribe_audio(audio_file) -> str: r = await openai.Audio.atranscribe("whisper-1", audio_file) - return r["text"] + return r["text"] or "" async def generate_images(prompt, n_images=4, size="512x512"): diff --git a/requirements.txt b/requirements.txt index 354063b..1f82206 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,5 +3,4 @@ openai>=0.27.0 tiktoken>=0.3.0 PyYAML==6.0 pymongo==4.3.3 -python-dotenv==0.21.0 -pydub==0.25.1 \ No newline at end of file +python-dotenv==0.21.0 \ No newline at end of file