#!/usr/bin/env python3
"""
Whisper Web UI - Browser-based Speech-to-Text
Runs locally on Jetson Orin Nano with GPU acceleration
"""
import gradio as gr
import whisper
import torch
import os
import tempfile

import scipy.io.wavfile as wav  # used to persist microphone recordings as WAV files

# Load model (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

MODEL_SIZE = os.getenv("WHISPER_MODEL", "base")
print(f"Loading Whisper model: {MODEL_SIZE}")
model = whisper.load_model(MODEL_SIZE, device=device)
print("Model loaded!")


def transcribe_audio(audio_path, language="auto", task="transcribe"):
    """Transcribe an audio file using Whisper and format the result as Markdown."""
    if audio_path is None:
        return "No audio provided"

    try:
        # fp16 inference is only valid on GPU; Whisper falls back to fp32 on CPU
        options = {
            "task": task,
            "fp16": device == "cuda",
        }
        if language != "auto":
            options["language"] = language

        result = model.transcribe(audio_path, **options)

        # Format output
        text = result["text"]
        detected_lang = result.get("language", "unknown")

        output = f"**Detected Language:** {detected_lang}\n\n"
        output += f"**Transcription:**\n{text}\n\n"

        # Add segments with timestamps
        output += "**Segments:**\n"
        for segment in result["segments"]:
            start = segment["start"]
            end = segment["end"]
            segment_text = segment["text"]
            output += f"[{start:.2f}s - {end:.2f}s] {segment_text}\n"

        return output
    except Exception as e:
        return f"Error: {e}"


def transcribe_microphone(audio, language="auto", task="transcribe"):
    """Transcribe microphone input; Gradio delivers a (sample_rate, numpy_array) tuple."""
    if audio is None:
        return "No audio recorded"

    sr, audio_data = audio

    # Whisper expects a file path, so write the recording to a temporary WAV file
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav.write(f.name, sr, audio_data)
        tmp_path = f.name

    try:
        return transcribe_audio(tmp_path, language, task)
    finally:
        os.unlink(tmp_path)  # remove the temp file even if transcription fails


# Language options
LANGUAGES = [
    ("Auto Detect", "auto"),
    ("German", "de"),
    ("English", "en"),
    ("French", "fr"),
    ("Italian", "it"),
    ("Spanish", "es"),
    ("Portuguese", "pt"),
    ("Dutch", "nl"),
    ("Russian", "ru"),
    ("Chinese", "zh"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
]

# Create Gradio interface
with gr.Blocks(title="Whisper Local - Speech to Text", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎤 Whisper Local - Speech to Text
### Runs 100% locally on Jetson Orin Nano with GPU acceleration
No cloud, no internet required, your audio stays private.
""") with gr.Tabs(): # Tab 1: Microphone with gr.TabItem("🎙️ Microphone"): gr.Markdown("Record audio directly from your microphone") with gr.Row(): mic_input = gr.Audio( sources=["microphone"], type="numpy", label="Record Audio" ) with gr.Row(): mic_language = gr.Dropdown( choices=LANGUAGES, value="auto", label="Language" ) mic_task = gr.Radio( choices=["transcribe", "translate"], value="transcribe", label="Task (translate = to English)" ) mic_button = gr.Button("🚀 Transcribe", variant="primary") mic_output = gr.Markdown(label="Result") mic_button.click( fn=transcribe_microphone, inputs=[mic_input, mic_language, mic_task], outputs=mic_output ) # Tab 2: File Upload with gr.TabItem("📁 File Upload"): gr.Markdown("Upload an audio or video file") with gr.Row(): file_input = gr.Audio( sources=["upload"], type="filepath", label="Upload Audio/Video" ) with gr.Row(): file_language = gr.Dropdown( choices=LANGUAGES, value="auto", label="Language" ) file_task = gr.Radio( choices=["transcribe", "translate"], value="transcribe", label="Task (translate = to English)" ) file_button = gr.Button("🚀 Transcribe", variant="primary") file_output = gr.Markdown(label="Result") file_button.click( fn=transcribe_audio, inputs=[file_input, file_language, file_task], outputs=file_output ) gr.Markdown(f""" --- **Model:** {MODEL_SIZE} | **Device:** {device} | **GPU:** {'✅ CUDA' if device == 'cuda' else '❌ CPU'} """) if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, share=False, ssl_verify=False )