"""Gradio web demo for the VISinger2 (no-MIDI) singing-voice models.

The script loads one generator checkpoint at import time, then serves a small
UI in which the user pastes a DiffSinger ``.ds`` project (JSON text), picks a
speaker and a transposition, and receives the synthesised audio.
"""
import io
import json
import logging
import os

import gradio as gr
import librosa
import numpy as np
import soundfile
import torch

from egs.visinger2.models import SynthesizerTrn
from infer import infer_ds
from utils import utils

logger = logging.getLogger(__name__)

# Third-party libraries that are too chatty below WARNING level.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)

config_json = "egs/visinger2/config.json"
model_path = "G_157000.pth"

# Limits enforced by the hosted demo.  The user-facing messages and the intro
# text below spell these numbers out, so keep them in sync when changing them.
MAX_SEGMENT_SECONDS = 30
MAX_TOTAL_SECONDS = 120
# Sample rate reported to the Gradio audio widget together with the waveform.
OUTPUT_SAMPLE_RATE = 44100

# Build the generator once and reuse it for every request.
hps = utils.get_hparams_from_file(config_json)
net_g = SynthesizerTrn(hps)
net_g.eval()
# NOTE(review): the third argument is presumably the optimizer slot, unused
# for inference -- confirm against utils.load_checkpoint.
utils.load_checkpoint(model_path, net_g, None)


def _parse_floats(field):
    """Parse a whitespace-separated string of numbers into a list of floats.

    Raises:
        AttributeError: if *field* is not a string.
        ValueError: if *field* holds no numbers or a token is not a number.
    """
    tokens = field.split()
    if not tokens:
        # An empty parameter is as unusable as a missing one.
        raise ValueError("empty numeric field")
    return [float(token) for token in tokens]


def vc_fn(speaker, ds, vc_transform):
    """Validate a pasted ds project and synthesise it.

    Args:
        speaker: Speaker name chosen in the UI; forwarded to ``infer_ds``.
        ds: Text of the ds project; expected to be a JSON list of segments,
            each carrying ``"f0_seq"`` and ``"ph_dur"`` strings.
        vc_transform: Transposition value from the UI; forwarded to
            ``infer_ds``.

    Returns:
        A ``(message, audio)`` pair.  On any validation failure ``audio`` is
        ``None`` and ``message`` explains the problem.  On success ``message``
        is ``"Success"`` and ``audio`` is ``(OUTPUT_SAMPLE_RATE, samples)``
        with ``samples`` cast to ``float32``.
    """
    try:
        ds = json.loads(ds)
    except (TypeError, ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; TypeError covers non-text
        # input; RecursionError covers absurdly nested payloads.
        return "工程文件json解析失败,请将ds文件的完整内容粘贴与此处", None

    total_seconds = 0
    segment_too_long = False
    try:
        for segment in ds:
            # The f0 curve is parsed only to prove it is present and numeric.
            _parse_floats(segment["f0_seq"])
            segment_seconds = sum(_parse_floats(segment["ph_dur"]))
            logger.debug("segment duration: %s", segment_seconds)
            total_seconds += segment_seconds
            if segment_seconds > MAX_SEGMENT_SECONDS:
                # Keep scanning: a malformed later segment must still be
                # reported in preference to the length error.
                segment_too_long = True
    except (KeyError, TypeError, ValueError, AttributeError):
        return "ds工程需要冻结f0和音素参数才能使用此模型合成", None

    if segment_too_long:
        return "单个切片时长必须小于30s,否则请使用本地推理", None
    if total_seconds > MAX_TOTAL_SECONDS:
        return "总时长需要小于2分钟,否则请使用本地推理", None

    # NOTE(review): infer_ds presumably returns a NumPy waveform -- confirm.
    out_audio = infer_ds(net_g, hps, ds, speaker, vc_transform)
    return "Success", (OUTPUT_SAMPLE_RATE, out_audio.astype(np.float32))


# Introductory Markdown shown at the top of the page.
_INTRO_MARKDOWN = (
    " 这是visinger2 塔菲、电棍模型的在线demo, github 仓库地址是"
    "[visinger2-nomidi](https://github.com/innnky/VISinger2-nomidi)"
    " 由于训练集为录播数据全自动化制作,因此质量比较差,此模型并非visinger2的音质上限,"
    "最高质量模型效果请参照[VISinger2官方demo](https://zhangyongmao.github.io/VISinger2/)"
    " 其中ds工程文件为[DiffSinger](https://github.com/openvpi/DiffSinger)工程,"
    "需要通过[OpenSVIP](https://openvpi.github.io/) 转换器进行制作,"
    "原理是先通过别的歌声合成软件制作工程并转换为模型能够接受的输入格式。"
    " 由于此模型是nomidi模型,因此导出ds工程时需要冻结音素和音高参数, 否则会报错,"
    "具体DiffSinger工程制作详细问题可以加入DiffSinger QQ交流群: 见b站视频 "
    "[BV1be411N7JA](https://www.bilibili.com/video/BV1be411N7JA) 评论区"
    " 在线推理限制为总时长小于2分钟,且单个切片时长小于30s,"
    "有更大需求请下载本仓库或github仓库代码运行ds_inference.py进行本地推理 "
)

# f0 curve of the bundled example.  It starts and ends with long flat
# plateaus, written here as repeat counts; only the moving middle part is
# spelled out.  The result is one space-separated string with no line breaks,
# because json.loads rejects raw control characters inside a JSON string.
_EXAMPLE_F0_SEQ = " ".join(
    ["301.9"] * 119
    + (
        "302.0 302.4 301.9 301.4 300.5 299.4 299.0 298.3 297.9 297.6 "
        "297.2 297.2 297.0 296.8 296.9 296.7 296.6 296.8 296.9 296.9 "
        "297.4 297.6 297.7 298.2 298.5 298.3 298.6 298.7 298.5 298.6 "
        "298.3 297.8 296.4 293.9 291.5 286.7 283.2 279.6 278.5 283.4 "
        "288.4 293.5 298.6 303.9 309.3 314.7 320.3 325.9 331.7 337.5 "
        "343.5 349.5 355.7 362.0 368.3 374.8 381.5 387.1 388.7 391.3 "
        "393.6 396.1 397.7 398.7 399.3 399.6 399.8 399.4 399.0 398.6 "
        "397.9 397.7 397.1 396.7 396.1 396.0 395.4 395.6 395.7 395.9 "
        "395.9 396.1 396.4 396.8 397.0 397.3 397.5 397.5 397.5 397.7 "
        "397.7 397.7 397.7 397.9 397.7 397.7 397.7 397.7 397.7 397.7 "
        "397.5 397.5 397.2 397.0 397.0 396.7 396.6 396.6 396.5 396.3 "
        "396.3 396.1 396.1 396.3 396.3 396.1 396.3 396.3 396.4 396.6 "
        "396.7 396.6 396.9 397.2 396.8 397.4 397.9 398.0 398.5 399.1 "
        "399.1 399.1 399.0 398.7"
    ).split()
    + ["398.2"] * 84
)

# Example ds project pre-filled in the text area so the demo works out of the
# box.
_DEFAULT_DS_PROJECT = (
    '[ { "text": "SP 清 晨 SP", '
    '"ph_seq": "SP q ing ch en SP", '
    '"note_seq": "rest D4 D4 G4 G4 rest", '
    '"note_dur_seq": "0.6 0.273 0.273 0.4089999 0.4089999 0.4", '
    '"is_slur_seq": "0 0 0 0 0 0", '
    '"ph_dur": "0.469318 0.130682 0.120727 0.152273 0.409 0.4", '
    '"f0_timestep": "0.005", '
    '"f0_seq": "' + _EXAMPLE_F0_SEQ + '", '
    '"input_type": "phoneme", "offset": 0.0 } ]'
)

app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value=_INTRO_MARKDOWN)
            sid = gr.Dropdown(label="音色", choices=["taffy", "otto"], value="taffy")
            vc_input3 = gr.TextArea(label="ds工程(json格式)", value=_DEFAULT_DS_PROJECT)
            vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
            vc_submit = gr.Button("合成", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
        # Event listeners have to be registered while the Blocks context is
        # still open.
        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform], [vc_output1, vc_output2])

app.launch()