{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: stream_asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(stream, new_chunk):\n", "    sr, y = new_chunk\n", "    \n", "    # Convert to mono if stereo\n", "    if y.ndim > 1:\n", "        y = y.mean(axis=1)\n", "        \n", "    y = y.astype(np.float32)\n", "    y /= np.max(np.abs(y))\n", "\n", "    if stream is not None:\n", "        stream = np.concatenate([stream, y])\n", "    else:\n", "        stream = y\n", "    return stream, transcriber({\"sampling_rate\": sr, \"raw\": stream})[\"text\"]  # type: ignore\n", "\n", "demo = gr.Interface(\n", "    transcribe,\n", "    [\"state\", gr.Audio(sources=[\"microphone\"], streaming=True)],\n", "    [\"state\", \"text\"],\n", "    live=True,\n", ")\n", "\n", "if __name__ == \"__main__\":\n", "    demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}