wa007 committed on
Commit
28988ce
1 Parent(s): 0ed093a
README.md CHANGED
@@ -1,3 +1,30 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ tags:
4
+ - audio
5
+ - text-to-speech
6
+ - instant-voice-cloning
7
+ language:
8
+ - en
9
+ - zh
10
+ inference: false
11
+ ---
12
+
13
+ # OpenVoice
14
+ OpenVoice is a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
15
+
16
+ <video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/641de0213239b631552713e4/uCHTHD9OUotgOflqDu3QK.mp4"></video>
17
+
18
+ ### Features
19
+ - **Accurate Tone Color Cloning.** OpenVoice can accurately clone the reference tone color and generate speech in multiple languages and accents.
20
+ - **Flexible Voice Style Control.** OpenVoice enables granular control over voice styles, such as emotion and accent, as well as other style parameters including rhythm, pauses, and intonation.
21
+ - **Zero-shot Cross-lingual Voice Cloning.** Neither the language of the generated speech nor the language of the reference speech needs to be present in the massive-speaker multi-lingual training dataset.
22
+
23
+ ### How to Use
24
+ Please see [usage](https://github.com/myshell-ai/OpenVoice/blob/main/docs/USAGE.md) for detailed instructions.
25
+
26
+ ### Links
27
+ - [GitHub](https://github.com/myshell-ai/OpenVoice)
28
+ - [HFDemo](https://huggingface.co/spaces/myshell-ai/OpenVoice)
29
+ - [Discord](https://discord.gg/myshell)
30
+
checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89ae83aa4e3668fef64b388b789ff7b0ce0def9f801069edfc18a00ea420748d
3
+ size 131327338
checkpoints/base_speakers/EN/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1db1ae1a5c8ded049bd1536051489aefbfad4a5077c01c2257e9e88fa1bb8422
3
+ size 160467309
checkpoints/base_speakers/EN/config.json ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 1,
136
+ "whispering": 2,
137
+ "shouting": 3,
138
+ "excited": 4,
139
+ "cheerful": 5,
140
+ "terrified": 6,
141
+ "angry": 7,
142
+ "sad": 8,
143
+ "friendly": 9
144
+ }
145
+ }
checkpoints/base_speakers/EN/en_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9cab24002eec738d0fe72cb73a34e57fbc3999c1bd4a1670a7b56ee4e3590ac9
3
+ size 1789
checkpoints/base_speakers/EN/en_style_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f698153be5004b90a8642d1157c89cae7dd296752a3276450ced6a17b8b98a9
3
+ size 1783
checkpoints/base_speakers/ZH/checkpoint.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de9fb0eb749f3254130fe0172fcbb20e75f88a9b16b54dd0b73cac0dc40da7d9
3
+ size 160467309
checkpoints/base_speakers/ZH/config.json ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "text_cleaners": [
4
+ "cjke_cleaners2"
5
+ ],
6
+ "sampling_rate": 22050,
7
+ "filter_length": 1024,
8
+ "hop_length": 256,
9
+ "win_length": 1024,
10
+ "n_mel_channels": 80,
11
+ "add_blank": true,
12
+ "cleaned_text": true,
13
+ "n_speakers": 10
14
+ },
15
+ "model": {
16
+ "inter_channels": 192,
17
+ "hidden_channels": 192,
18
+ "filter_channels": 768,
19
+ "n_heads": 2,
20
+ "n_layers": 6,
21
+ "n_layers_trans_flow": 3,
22
+ "kernel_size": 3,
23
+ "p_dropout": 0.1,
24
+ "resblock": "1",
25
+ "resblock_kernel_sizes": [
26
+ 3,
27
+ 7,
28
+ 11
29
+ ],
30
+ "resblock_dilation_sizes": [
31
+ [
32
+ 1,
33
+ 3,
34
+ 5
35
+ ],
36
+ [
37
+ 1,
38
+ 3,
39
+ 5
40
+ ],
41
+ [
42
+ 1,
43
+ 3,
44
+ 5
45
+ ]
46
+ ],
47
+ "upsample_rates": [
48
+ 8,
49
+ 8,
50
+ 2,
51
+ 2
52
+ ],
53
+ "upsample_initial_channel": 512,
54
+ "upsample_kernel_sizes": [
55
+ 16,
56
+ 16,
57
+ 4,
58
+ 4
59
+ ],
60
+ "n_layers_q": 3,
61
+ "use_spectral_norm": false,
62
+ "gin_channels": 256
63
+ },
64
+ "symbols": [
65
+ "_",
66
+ ",",
67
+ ".",
68
+ "!",
69
+ "?",
70
+ "-",
71
+ "~",
72
+ "\u2026",
73
+ "N",
74
+ "Q",
75
+ "a",
76
+ "b",
77
+ "d",
78
+ "e",
79
+ "f",
80
+ "g",
81
+ "h",
82
+ "i",
83
+ "j",
84
+ "k",
85
+ "l",
86
+ "m",
87
+ "n",
88
+ "o",
89
+ "p",
90
+ "s",
91
+ "t",
92
+ "u",
93
+ "v",
94
+ "w",
95
+ "x",
96
+ "y",
97
+ "z",
98
+ "\u0251",
99
+ "\u00e6",
100
+ "\u0283",
101
+ "\u0291",
102
+ "\u00e7",
103
+ "\u026f",
104
+ "\u026a",
105
+ "\u0254",
106
+ "\u025b",
107
+ "\u0279",
108
+ "\u00f0",
109
+ "\u0259",
110
+ "\u026b",
111
+ "\u0265",
112
+ "\u0278",
113
+ "\u028a",
114
+ "\u027e",
115
+ "\u0292",
116
+ "\u03b8",
117
+ "\u03b2",
118
+ "\u014b",
119
+ "\u0266",
120
+ "\u207c",
121
+ "\u02b0",
122
+ "`",
123
+ "^",
124
+ "#",
125
+ "*",
126
+ "=",
127
+ "\u02c8",
128
+ "\u02cc",
129
+ "\u2192",
130
+ "\u2193",
131
+ "\u2191",
132
+ " "
133
+ ],
134
+ "speakers": {
135
+ "default": 0
136
+ }
137
+ }
checkpoints/base_speakers/ZH/zh_default_se.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b62e8264962059b8a84dd00b29e2fcccc92f5d3be90eec67dfa082c0cf58ccf
3
+ size 1789
config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data": {
3
+ "sampling_rate": 22050,
4
+ "filter_length": 1024,
5
+ "hop_length": 256,
6
+ "win_length": 1024,
7
+ "n_speakers": 0
8
+ },
9
+ "model": {
10
+ "inter_channels": 192,
11
+ "hidden_channels": 192,
12
+ "filter_channels": 768,
13
+ "n_heads": 2,
14
+ "n_layers": 6,
15
+ "kernel_size": 3,
16
+ "p_dropout": 0.1,
17
+ "resblock": "1",
18
+ "resblock_kernel_sizes": [
19
+ 3,
20
+ 7,
21
+ 11
22
+ ],
23
+ "resblock_dilation_sizes": [
24
+ [
25
+ 1,
26
+ 3,
27
+ 5
28
+ ],
29
+ [
30
+ 1,
31
+ 3,
32
+ 5
33
+ ],
34
+ [
35
+ 1,
36
+ 3,
37
+ 5
38
+ ]
39
+ ],
40
+ "upsample_rates": [
41
+ 8,
42
+ 8,
43
+ 2,
44
+ 2
45
+ ],
46
+ "upsample_initial_channel": 512,
47
+ "upsample_kernel_sizes": [
48
+ 16,
49
+ 16,
50
+ 4,
51
+ 4
52
+ ],
53
+ "n_layers_q": 3,
54
+ "use_spectral_norm": false,
55
+ "gin_channels": 256
56
+ }
57
+ }