# speecht5-vc / manifest / utils / prep_cmu_arctic_spkemb.py

import argparse
import glob
import os

import numpy
import torch
import torch.nn.functional as F
import torchaudio
from speechbrain.pretrained import EncoderClassifier
from tqdm import tqdm

# Supported SpeechBrain speaker encoders and their embedding dimensionalities.
spk_model = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}


def f2embed(wav_file, classifier, size_embed):
    """Extract a single speaker embedding from a 16 kHz wav file."""
    signal, fs = torchaudio.load(wav_file)
    assert fs == 16000, fs
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)
        # L2-normalize along the embedding dimension, then drop the batch axes.
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == size_embed, embeddings.shape[0]
    return embeddings
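

# Hedged usage sketch (not part of the original script): calling f2embed on a
# single file with the x-vector model. The wav path is hypothetical and must
# point to 16 kHz audio.
#
#   xvect_classifier = EncoderClassifier.from_hparams(
#       source="speechbrain/spkrec-xvect-voxceleb",
#       savedir="/tmp/speechbrain/spkrec-xvect-voxceleb",
#   )
#   emb = f2embed("arctic_a0001.wav", xvect_classifier, 512)  # numpy array, shape (512,)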


def process(args):
    # Collect wav files from each split: <arctic_root>/<split>/wav/*.wav
    wavlst = []
    for split in args.splits.split(","):
        wav_dir = os.path.join(args.arctic_root, split)
        wavlst_split = glob.glob(os.path.join(wav_dir, "wav", "*.wav"))
        print(f"{split}: {len(wavlst_split)} utterances.")
        wavlst.extend(wavlst_split)

    spkemb_root = args.output_root
    if not os.path.exists(spkemb_root):
        print(f"Create speaker embedding directory: {spkemb_root}")
        os.mkdir(spkemb_root)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    classifier = EncoderClassifier.from_hparams(
        source=args.speaker_embed,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", args.speaker_embed),
    )
    size_embed = spk_model[args.speaker_embed]

    # Save one <split>-wav-<utterance>.npy embedding per wav file.
    for utt_i in tqdm(wavlst, total=len(wavlst), desc="Extract"):
        # TODO rename speaker embedding
        utt_id = "-".join(utt_i.split("/")[-3:]).replace(".wav", "")
        utt_emb = f2embed(utt_i, classifier, size_embed)
        numpy.save(os.path.join(spkemb_root, f"{utt_id}.npy"), utt_emb)
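

# Hedged usage sketch (not part of the original script): how a saved embedding
# could be loaded downstream, e.g. to condition voice conversion with SpeechT5.
# The file name below is hypothetical but follows the <split>-wav-<utt> naming
# produced by process().
#
#   import numpy as np
#   spkemb = np.load("spkrec-xvect/cmu_us_slt_arctic-wav-arctic_a0001.npy")
#   spkemb = torch.from_numpy(spkemb).unsqueeze(0)  # shape (1, 512) x-vector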


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--arctic-root", "-i", required=True, type=str,
                        help="CMU ARCTIC root directory.")
    parser.add_argument("--output-root", "-o", required=True, type=str,
                        help="Output directory.")
    parser.add_argument("--speaker-embed", "-s", required=True, type=str,
                        choices=["speechbrain/spkrec-xvect-voxceleb", "speechbrain/spkrec-ecapa-voxceleb"],
                        help="Pretrained model for extracting speaker embeddings.")
    parser.add_argument("--splits", type=str,
                        default="cmu_us_bdl_arctic,cmu_us_clb_arctic,cmu_us_rms_arctic,cmu_us_slt_arctic",
                        help="Comma-separated list of the four speaker splits.")
    args = parser.parse_args()
    print(f"Loading utterances from {args.arctic_root}/{args.splits}, "
          + f"saving speaker embedding 'npy' files to {args.output_root}, "
          + f"using speaker model {args.speaker_embed} with embedding size {spk_model[args.speaker_embed]}.")
    process(args)


if __name__ == "__main__":
    """
    python utils/prep_cmu_arctic_spkemb.py \
        -i /root/data/cmu_arctic/CMUARCTIC \
        -o /root/data/cmu_arctic/CMUARCTIC/spkrec-xvect \
        -s speechbrain/spkrec-xvect-voxceleb
    """
    main()