#!/usr/bin/env python3
"""
Whisper Web UI - Browser-based Speech-to-Text
Runs locally on Jetson Orin Nano with GPU acceleration
"""
import os
import tempfile

import gradio as gr
import scipy.io.wavfile as wav  # used by the microphone tab to write WAV files
import torch
import whisper

# Load model (use GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
MODEL_SIZE = os.getenv("WHISPER_MODEL", "base")
print(f"Loading Whisper model: {MODEL_SIZE}")
model = whisper.load_model(MODEL_SIZE, device=device)
print("Model loaded!")


def transcribe_audio(audio_path, language="auto", task="transcribe"):
    """Transcribe audio file using Whisper"""
    if audio_path is None:
        return "No audio provided"
    try:
        # Transcribe (fp16 only helps on CUDA)
        options = {
            "task": task,
            "fp16": device == "cuda",
        }
        if language != "auto":
            options["language"] = language
        result = model.transcribe(audio_path, **options)

        # Format output
        text = result["text"]
        detected_lang = result.get("language", "unknown")
        output = f"**Detected Language:** {detected_lang}\n\n"
        output += f"**Transcription:**\n{text}\n\n"

        # Add segments with timestamps
        output += "**Segments:**\n"
        for segment in result["segments"]:
            start = segment["start"]
            end = segment["end"]
            segment_text = segment["text"]
            output += f"[{start:.2f}s - {end:.2f}s] {segment_text}\n"
        return output
    except Exception as e:
        return f"Error: {str(e)}"


def transcribe_microphone(audio, language="auto", task="transcribe"):
    """Transcribe from microphone input"""
    if audio is None:
        return "No audio recorded"
    # audio is a tuple (sample_rate, numpy_array) from gradio
    sr, audio_data = audio
    # Save to a temp file so Whisper (via ffmpeg) can read it
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav.write(f.name, sr, audio_data)
    try:
        return transcribe_audio(f.name, language, task)
    finally:
        os.unlink(f.name)  # clean up the temp file after transcription
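
# Note: with type="numpy", Gradio hands back 16-bit PCM samples, which
# scipy.io.wavfile.write stores as-is.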

# Language options
LANGUAGES = [
    ("Auto Detect", "auto"),
    ("German", "de"),
    ("English", "en"),
    ("French", "fr"),
    ("Italian", "it"),
    ("Spanish", "es"),
    ("Portuguese", "pt"),
    ("Dutch", "nl"),
    ("Russian", "ru"),
    ("Chinese", "zh"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
]
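
# gr.Dropdown accepts (label, value) pairs; callbacks receive the value,
# i.e. the language code that transcribe_audio expects.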

# Create Gradio interface
with gr.Blocks(title="Whisper Local - Speech to Text", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Whisper Local - Speech to Text
    ### Runs 100% locally on Jetson Orin Nano with GPU acceleration
    No cloud, no internet required, your audio stays private.
    """)
    with gr.Tabs():
        # Tab 1: Microphone
        with gr.TabItem("🎙️ Microphone"):
            gr.Markdown("Record audio directly from your microphone")
            with gr.Row():
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Record Audio"
                )
            with gr.Row():
                mic_language = gr.Dropdown(
                    choices=LANGUAGES,
                    value="auto",
                    label="Language"
                )
                mic_task = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task (translate = to English)"
                )
            mic_button = gr.Button("🚀 Transcribe", variant="primary")
            mic_output = gr.Markdown(label="Result")
            mic_button.click(
                fn=transcribe_microphone,
                inputs=[mic_input, mic_language, mic_task],
                outputs=mic_output
            )
        # Tab 2: File Upload
        with gr.TabItem("📁 File Upload"):
            gr.Markdown("Upload an audio or video file")
            with gr.Row():
                file_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio/Video"
                )
            with gr.Row():
                file_language = gr.Dropdown(
                    choices=LANGUAGES,
                    value="auto",
                    label="Language"
                )
                file_task = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task (translate = to English)"
                )
            file_button = gr.Button("🚀 Transcribe", variant="primary")
            file_output = gr.Markdown(label="Result")
            file_button.click(
                fn=transcribe_audio,
                inputs=[file_input, file_language, file_task],
                outputs=file_output
            )
    gr.Markdown(f"""
    ---
    **Model:** {MODEL_SIZE} | **Device:** {device} | **GPU:** {'✅ CUDA' if device == 'cuda' else '❌ CPU'}
    """)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces, not just localhost
        server_port=7860,
        share=False,            # keep everything local; no public Gradio tunnel
        ssl_verify=False
    )
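
# Usage:
#   WHISPER_MODEL=small python3 app.py
# then open http://<jetson-ip>:7860 from any device on the LAN.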