#!/usr/bin/env python3
"""
Whisper Web UI - Browser-based Speech-to-Text

Runs locally on Jetson Orin Nano with GPU acceleration
"""
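
# Dependencies -- a sketch of the assumed setup, adjust to your environment:
# the openai-whisper and gradio Python packages, torch with CUDA support for
# the Jetson, and scipy (used below to write microphone audio to a WAV file).
# Whisper also needs the ffmpeg binary on PATH to decode uploaded audio/video.
#
#   pip install openai-whisper gradio scipy
#   sudo apt-get install ffmpeg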

import os
import tempfile

import gradio as gr
import scipy.io.wavfile as wav
import torch
import whisper

# Load the model once at startup; use the GPU when CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model size can be overridden via the WHISPER_MODEL environment variable
# (Whisper ships tiny, base, small, medium and large variants)
MODEL_SIZE = os.getenv("WHISPER_MODEL", "base")
print(f"Loading Whisper model: {MODEL_SIZE}")
model = whisper.load_model(MODEL_SIZE, device=device)
print("Model loaded!")


def transcribe_audio(audio_path, language="auto", task="transcribe"):
    """Transcribe an audio file using Whisper"""
    if audio_path is None:
        return "No audio provided"

    try:
        # Transcription options; fp16 only makes sense on the GPU
        options = {
            "task": task,
            "fp16": device == "cuda"
        }
        if language != "auto":
            options["language"] = language

        result = model.transcribe(audio_path, **options)

        # Format output
        text = result["text"]
        detected_lang = result.get("language", "unknown")

        output = f"**Detected Language:** {detected_lang}\n\n"
        output += f"**Transcription:**\n{text}\n\n"

        # Add segments with timestamps
        output += "**Segments:**\n"
        for segment in result["segments"]:
            start = segment["start"]
            end = segment["end"]
            segment_text = segment["text"]
            output += f"[{start:.2f}s - {end:.2f}s] {segment_text}\n"

        return output

    except Exception as e:
        return f"Error: {str(e)}"


def transcribe_microphone(audio, language="auto", task="transcribe"):
    """Transcribe from microphone input"""
    if audio is None:
        return "No audio recorded"

    # Gradio delivers microphone audio as a (sample_rate, numpy_array) tuple
    sr, audio_data = audio

    # Write the recording to a temporary WAV file Whisper can read,
    # and remove it again once transcription is done
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav.write(f.name, sr, audio_data)
        temp_path = f.name

    try:
        return transcribe_audio(temp_path, language, task)
    finally:
        os.unlink(temp_path)


# Language options
LANGUAGES = [
    ("Auto Detect", "auto"),
    ("German", "de"),
    ("English", "en"),
    ("French", "fr"),
    ("Italian", "it"),
    ("Spanish", "es"),
    ("Portuguese", "pt"),
    ("Dutch", "nl"),
    ("Russian", "ru"),
    ("Chinese", "zh"),
    ("Japanese", "ja"),
    ("Korean", "ko"),
]
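
# Note: the codes above are standard Whisper language codes; the list is only a
# convenience subset for the dropdown and can be extended with any language
# Whisper supports (e.g. ("Polish", "pl") -- an illustrative addition).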

# Create Gradio interface
with gr.Blocks(title="Whisper Local - Speech to Text", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎤 Whisper Local - Speech to Text
    ### Runs 100% locally on Jetson Orin Nano with GPU acceleration
    No cloud, no internet required, your audio stays private.
    """)

    with gr.Tabs():
        # Tab 1: Microphone
        with gr.TabItem("🎙️ Microphone"):
            gr.Markdown("Record audio directly from your microphone")

            with gr.Row():
                mic_input = gr.Audio(
                    sources=["microphone"],
                    type="numpy",
                    label="Record Audio"
                )

            with gr.Row():
                mic_language = gr.Dropdown(
                    choices=LANGUAGES,
                    value="auto",
                    label="Language"
                )
                mic_task = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task (translate = to English)"
                )

            mic_button = gr.Button("🚀 Transcribe", variant="primary")
            mic_output = gr.Markdown(label="Result")

            mic_button.click(
                fn=transcribe_microphone,
                inputs=[mic_input, mic_language, mic_task],
                outputs=mic_output
            )

        # Tab 2: File Upload
        with gr.TabItem("📁 File Upload"):
            gr.Markdown("Upload an audio or video file")

            with gr.Row():
                file_input = gr.Audio(
                    sources=["upload"],
                    type="filepath",
                    label="Upload Audio/Video"
                )

            with gr.Row():
                file_language = gr.Dropdown(
                    choices=LANGUAGES,
                    value="auto",
                    label="Language"
                )
                file_task = gr.Radio(
                    choices=["transcribe", "translate"],
                    value="transcribe",
                    label="Task (translate = to English)"
                )

            file_button = gr.Button("🚀 Transcribe", variant="primary")
            file_output = gr.Markdown(label="Result")

            file_button.click(
                fn=transcribe_audio,
                inputs=[file_input, file_language, file_task],
                outputs=file_output
            )

    gr.Markdown(f"""
    ---
    **Model:** {MODEL_SIZE} | **Device:** {device} | **GPU:** {'✅ CUDA' if device == 'cuda' else '❌ CPU'}
    """)


if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        ssl_verify=False
    )
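
# Usage -- a sketch, assuming this file is saved as whisper_web_ui.py on the
# Jetson (the filename is illustrative, not required by the script):
#
#   WHISPER_MODEL=small python3 whisper_web_ui.py
#
# Because server_name is "0.0.0.0", the UI listens on all interfaces and can be
# opened from another machine on the local network at http://<jetson-ip>:7860.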