wetdog committed on
Commit
4876346
1 Parent(s): c17cc3e

Add mosanet gradio demo

Files changed (4)
  1. Dockerfile +48 -0
  2. app.py +103 -0
  3. modules.py +152 -0
  4. requirements.txt +5 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ # Use an official Python runtime as a parent image
+ FROM python:3.10.12-slim
+
+ # Install build tools and ffmpeg (ffmpeg is required for audio decoding)
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     autoconf \
+     automake \
+     libtool \
+     pkg-config \
+     git \
+     cmake \
+     ffmpeg \
+     && rm -rf /var/lib/apt/lists/*
+
+
+ RUN pip install --upgrade pip
+
+ RUN mkdir -p cache && chmod 777 cache
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ # Set the working directory to the user's home directory
+ WORKDIR $HOME/app
+ # Install the Python dependencies
+
+ COPY --chown=user requirements.txt $HOME/app/
+
+ RUN pip install -r requirements.txt
+
+
+ COPY --chown=user . $HOME/app/
+
+ # Fix ownership issues
+ USER root
+ RUN chown -R user:user $HOME/app
+ USER user
+
+ EXPOSE 7860
+
+ CMD ["python3", "-u", "app.py"]
+
+
app.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import torch
+ from transformers import AutoFeatureExtractor, WhisperModel, AutoModelForSpeechSeq2Seq
+ import numpy as np
+ import torchaudio
+ import librosa
+
+ import gradio as gr
+ from modules import load_audio, MosPredictor, denorm
+
+
+ mos_checkpoint = "ckpt_mosa_net_plus"
+
+ print('Loading MOSA-Net+ checkpoint...')
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+ model = MosPredictor().to(device)
+ model.load_state_dict(torch.load(mos_checkpoint, map_location=device))
+ model.eval()
+
+ print('Loading Whisper checkpoint...')
+ feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v3")
+ # model_asli = WhisperModel.from_pretrained("openai/whisper-large-v3")
+ model_asli = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3", torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, attn_implementation="sdpa")
+ model_asli = model_asli.to(device)
+
+
+ def predict_mos(wavefile: str):
+
+     print('Starting prediction...')
+     # STFT magnitude features
+     wav = torchaudio.load(wavefile)[0]
+     lps = torch.from_numpy(np.expand_dims(np.abs(librosa.stft(wav[0].detach().numpy(), n_fft=512, hop_length=256, win_length=512)).T, axis=0))
+     lps = lps.unsqueeze(1)
+
+     # Whisper features
+     audio = load_audio(wavefile)
+     inputs = feature_extractor(audio, return_tensors="pt")
+     input_features = inputs.input_features
+     input_features = input_features.to(device, torch_dtype)  # match the Whisper model's dtype
+
+     with torch.no_grad():
+         decoder_input_ids = torch.tensor([[1, 1]]) * model_asli.config.decoder_start_token_id
+         decoder_input_ids = decoder_input_ids.to(device)
+         last_hidden_state = model_asli(input_features, decoder_input_ids=decoder_input_ids).encoder_last_hidden_state
+         whisper_feat = last_hidden_state.float()  # MosPredictor runs in float32
+
+     print('Model feature shapes...')
+     print(whisper_feat.shape)
+     print(wav.shape)
+     print(lps.shape)
+
+     # Prediction
+     wav = wav.to(device)
+     lps = lps.to(device)
+     Quality_1, Intell_1, frame1, frame2 = model(wav, lps, whisper_feat)
+     quality_pred = Quality_1.cpu().detach().numpy()[0]
+     intell_pred = Intell_1.cpu().detach().numpy()[0]
+
+     print("predictions")
+     qa_text = f"Quality: {denorm(quality_pred)[0]:.2f} Intelligibility: {intell_pred[0]:.2f}"
+     print(qa_text)
+     return qa_text
+
+
+ title = """
+ <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+ <div
+ style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
+ > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
+ MOSA-Net Whisper features
+ </h1> </div>
+ </div>
+ """
+
+ description = """
+ This is a demo of [MOSA-Net+](https://github.com/dhimasryan/MOSA-Net-Cross-Domain/tree/main/MOSA_Net%2B),
+ an enhanced version of the multi-objective speech assessment model MOSA-Net that leverages acoustic features from Whisper, a large-scale weakly supervised model.
+ MOSA-Net+ was tested in the noisy-and-enhanced track of the VoiceMOS Challenge 2023, where it obtained the top-ranked performance among nine systems ([full paper](https://arxiv.org/abs/2309.12766)).
+ """
+
+ article = """
+ If the model contributes to your research, please cite the following works:
+
+ R. E. Zezario, S.-W. Fu, F. Chen, C.-S. Fuh, H.-M. Wang and Y. Tsao, "Deep Learning-Based Non-Intrusive Multi-Objective Speech Assessment Model With Cross-Domain Features," IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, pp. 54-70, 2023, doi: 10.1109/TASLP.2022.3205757.
+
+ R. E. Zezario, Y.-W. Chen, S.-W. Fu, Y. Tsao, H.-M. Wang, C.-S. Fuh, "A Study on Incorporating Whisper for Robust Speech Assessment," IEEE ICME 2024, July 2024 (top performance on Track 3 of the VoiceMOS Challenge 2023).
+
+ Demo contributed by [@wetdog](https://github.com/wetdog)
+ """
+ demo = gr.Blocks()
+ with demo:
+     gr.Markdown(title)
+     gr.Markdown(description)
+     gr.Interface(
+         fn=predict_mos,
+         inputs=gr.Audio(type='filepath'),
+         outputs="text",
+         allow_flagging="never")
+     gr.Markdown(article)
+
+ demo.queue(max_size=10)
+ demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)
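
Note on the Whisper step in predict_mos: only the encoder output is used, and the dummy decoder_input_ids are just a way to reach encoder_last_hidden_state through the seq2seq model. A roughly equivalent sketch that calls the encoder directly is shown below; it assumes a hypothetical local file sample.wav and loads the full-precision weights, unlike the fp16 path used on GPU above.

import torch
import librosa
from transformers import AutoFeatureExtractor, WhisperModel

# Sketch only: obtain the 1280-dim Whisper encoder features without a decoder pass.
feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-large-v3")
encoder = WhisperModel.from_pretrained("openai/whisper-large-v3").get_encoder()

audio, _ = librosa.load("sample.wav", sr=16000)  # hypothetical input file
inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    whisper_feat = encoder(inputs.input_features).last_hidden_state  # [1, 1500, 1280]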
modules.py ADDED
@@ -0,0 +1,150 @@
+ import os
+ import torch
+ import argparse
+ import numpy as np
+ from transformers import AutoFeatureExtractor, WhisperModel
+
+ import torchaudio
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ import speechbrain
+ import librosa
+
+ from subprocess import CalledProcessError, run
+
+ # OpenAI Whisper's load_audio helper
+ SAMPLE_RATE = 16000
+ def denorm(input_x):
+     input_x = input_x * (5 - 0) + 0  # rescale scores from [0, 1] back to the [0, 5] MOS range
+     return input_x
+
+ def load_audio(file: str, sr: int = SAMPLE_RATE):
+     """
+     Open an audio file and read as mono waveform, resampling as necessary
+
+     Parameters
+     ----------
+     file: str
+         The audio file to open
+
+     sr: int
+         The sample rate to resample the audio if necessary
+
+     Returns
+     -------
+     A NumPy array containing the audio waveform, in float32 dtype.
+     """
+
+     # This launches a subprocess to decode audio while down-mixing
+     # and resampling as necessary. Requires the ffmpeg CLI in PATH.
+     # fmt: off
+     cmd = [
+         "ffmpeg",
+         "-nostdin",
+         "-threads", "0",
+         "-i", file,
+         "-f", "s16le",
+         "-ac", "1",
+         "-acodec", "pcm_s16le",
+         "-ar", str(sr),
+         "-"
+     ]
+     # fmt: on
+     try:
+         out = run(cmd, capture_output=True, check=True).stdout
+     except CalledProcessError as e:
+         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+
+ class MosPredictor(nn.Module):
+
+     def __init__(self):
+         super().__init__()
+
+         self.mean_net_conv = nn.Sequential(
+             nn.Conv2d(in_channels=1, out_channels=16, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), padding=(1, 1), stride=(1, 3)),
+             nn.Dropout(0.3),
+             nn.BatchNorm2d(16),
+             nn.ReLU(),
+             nn.Conv2d(in_channels=16, out_channels=32, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), padding=(1, 1), stride=(1, 3)),
+             nn.Dropout(0.3),
+             nn.BatchNorm2d(32),
+             nn.ReLU(),
+             nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), padding=(1, 1), stride=(1, 3)),
+             nn.Dropout(0.3),
+             nn.BatchNorm2d(64),
+             nn.ReLU(),
+             nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), padding=(1, 1)),
+             nn.Conv2d(in_channels=128, out_channels=128, kernel_size=(3, 3), padding=(1, 1), stride=(1, 3)),
+             nn.Dropout(0.3),
+             nn.BatchNorm2d(128),
+             nn.ReLU())
+
+         self.relu_ = nn.ReLU()
+         self.sigmoid_ = nn.Sigmoid()
+
+         self.ssl_features = 1280
+         self.dim_layer = nn.Linear(self.ssl_features, 512)
+
+         self.mean_net_rnn = nn.LSTM(input_size=512, hidden_size=128, num_layers=1, batch_first=True, bidirectional=True)
+         self.mean_net_dnn = nn.Sequential(
+             nn.Linear(256, 128),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+         )
+
+         self.sinc = speechbrain.nnet.CNN.SincConv(in_channels=1, out_channels=257, kernel_size=251, stride=256, sample_rate=16000)
+         self.att_output_layer_quality = nn.MultiheadAttention(128, num_heads=8)
+         self.output_layer_quality = nn.Linear(128, 1)
+         self.qualaverage_score = nn.AdaptiveAvgPool1d(1)
+
+         self.att_output_layer_intell = nn.MultiheadAttention(128, num_heads=8)
+         self.output_layer_intell = nn.Linear(128, 1)
+         self.intellaverage_score = nn.AdaptiveAvgPool1d(1)
+
+         self.att_output_layer_stoi = nn.MultiheadAttention(128, num_heads=8)
+         self.output_layer_stoi = nn.Linear(128, 1)
+         self.stoiaverage_score = nn.AdaptiveAvgPool1d(1)
+
+
+     def forward(self, wav, lps, whisper):
+         # SSL (Whisper) features
+         wav_ = wav.squeeze(1)  # [batch, audio_len]
+         ssl_feat_red = self.dim_layer(whisper.squeeze(1))
+         ssl_feat_red = self.relu_(ssl_feat_red)
+
+         # PS (power-spectral) features
+         sinc_feat = self.sinc(wav_)
+         unsq_sinc = torch.unsqueeze(sinc_feat, dim=1)
+         concat_lps_sinc = torch.cat((lps, unsq_sinc), dim=2)
+         cnn_out = self.mean_net_conv(concat_lps_sinc)
+         batch = concat_lps_sinc.shape[0]
+         time = concat_lps_sinc.shape[2]
+         re_cnn = cnn_out.view((batch, time, 512))
+
+         concat_feat = torch.cat((re_cnn, ssl_feat_red), dim=1)
+         out_lstm, (h, c) = self.mean_net_rnn(concat_feat)
+         out_dense = self.mean_net_dnn(out_lstm)  # (batch, seq, 128)
+
+         quality_att, _ = self.att_output_layer_quality(out_dense, out_dense, out_dense)
+         frame_quality = self.output_layer_quality(quality_att)
+         frame_quality = self.sigmoid_(frame_quality)
+         quality_utt = self.qualaverage_score(frame_quality.permute(0, 2, 1))
+
+         int_att, _ = self.att_output_layer_intell(out_dense, out_dense, out_dense)
+         frame_int = self.output_layer_intell(int_att)
+         frame_int = self.sigmoid_(frame_int)
+         int_utt = self.intellaverage_score(frame_int.permute(0, 2, 1))
+
+
+         return quality_utt.squeeze(1), int_utt.squeeze(1), frame_quality.squeeze(2), frame_int.squeeze(2)
+
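
MosPredictor.forward expects a batched waveform, an STFT magnitude tensor, and Whisper encoder features. A rough shape sanity-check with random weights and illustrative sizes (one second of 16 kHz audio, a 1500-frame Whisper feature map) might look like the sketch below; it is an assumption-laden illustration, not part of the committed demo.

import numpy as np
import torch
import librosa
from modules import MosPredictor

model = MosPredictor().eval()  # untrained weights, shapes only

wav = torch.randn(1, 1, 16000)  # [batch, 1, samples] at 16 kHz (illustrative length)
spec = np.abs(librosa.stft(wav[0, 0].numpy(), n_fft=512, hop_length=256, win_length=512)).T
lps = torch.from_numpy(spec).unsqueeze(0).unsqueeze(1)  # [batch, 1, frames, 257]
whisper_feat = torch.randn(1, 1500, 1280)  # Whisper-large encoder output size

with torch.no_grad():
    quality, intell, frame_q, frame_i = model(wav, lps, whisper_feat)
print(quality.shape, intell.shape)  # torch.Size([1, 1]) for each utterance-level score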
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ speechbrain
+ librosa
+ gradio
+ accelerate