anton-l HF staff committed on
Commit
56717e3
1 Parent(s): 715ab66
Files changed (8) hide show
  1. .gitattributes +0 -0
  2. .gitignore +0 -0
  3. README.template.md +35 -3
  4. dataset_script.py +3 -3
  5. generate_datasets.py +5 -0
  6. languages.ftl +13 -1
  7. publish.py +3 -0
  8. test.py +1 -1
.gitattributes CHANGED
File without changes
.gitignore CHANGED
File without changes
README.template.md CHANGED
@@ -4,9 +4,9 @@ annotations_creators:
4
  - crowdsourced
5
  language_creators:
6
  - crowdsourced
7
- languages:
8
  {{LANGUAGES}}
9
- licenses:
10
  - cc0-1.0
11
  multilinguality:
12
  - multilingual
@@ -68,7 +68,7 @@ Take a look at the [Languages](https://commonvoice.mozilla.org/en/languages) pag
68
  ### Supported Tasks and Leaderboards
69
 
70
  The results for models trained on the Common Voice datasets are available via the
71
- [Papers with Code Leaderboards](https://paperswithcode.com/dataset/common-voice)
72
 
73
  ### Languages
74
 
@@ -142,6 +142,38 @@ The other data is data that has not yet been reviewed.
142
 
143
  The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  ## Dataset Creation
146
 
147
  ### Curation Rationale
 
4
  - crowdsourced
5
  language_creators:
6
  - crowdsourced
7
+ language_bcp47:
8
  {{LANGUAGES}}
9
+ license:
10
  - cc0-1.0
11
  multilinguality:
12
  - multilingual
 
68
  ### Supported Tasks and Leaderboards
69
 
70
  The results for models trained on the Common Voice datasets are available via the
71
+ [🤗 Speech Bench](https://huggingface.co/spaces/huggingface/hf-speech-bench)
72
 
73
  ### Languages
74
 
 
142
 
143
  The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
144
 
145
+ ## Data Preprocessing Recommended by Hugging Face
146
+
147
+ The following are data preprocessing steps advised by the Hugging Face team. They are accompanied by an example code snippet that shows how to put them into practice.
148
+
149
+ Many examples in this dataset have trailing quotation marks, e.g. _“the cat sat on the mat.”_. These trailing quotation marks do not change the actual meaning of the sentence, and it is nearly impossible to infer whether a sentence is a quotation or not a quotation from audio data alone. In these cases, it is advised to strip the quotation marks, leaving: _the cat sat on the mat_.
150
+
151
+ In addition, the majority of training sentences end in punctuation ( . or ? or ! ), whereas just a small proportion do not. In the dev set, **almost all** sentences end in punctuation. Thus, it is recommended to append a full-stop ( . ) to the end of the small number of training examples that do not end in punctuation.
152
+
153
+ ```python
154
+ from datasets import load_dataset
155
+
156
+ ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
157
+
158
+ def prepare_dataset(batch):
159
+ """Function to preprocess the dataset with the .map method"""
160
+ transcription = batch["sentence"]
161
+
162
+ if transcription.startswith('"') and transcription.endswith('"'):
163
+ # we can remove trailing quotation marks as they do not affect the transcription
164
+ transcription = transcription[1:-1]
165
+
166
+ if transcription[-1] not in [".", "?", "!"]:
167
+ # append a full-stop to sentences that do not end in punctuation
168
+ transcription = transcription + "."
169
+
170
+ batch["sentence"] = transcription
171
+
172
+ return batch
173
+
174
+ ds = ds.map(prepare_dataset, desc="preprocess dataset")
175
+ ```
176
+
177
  ## Dataset Creation
178
 
179
  ### Curation Rationale
dataset_script.py CHANGED
@@ -82,9 +82,9 @@ class CommonVoice(datasets.GeneratorBasedBuilder):
82
  release_date=STATS["date"],
83
  num_clips=lang_stats["clips"],
84
  num_speakers=lang_stats["users"],
85
- validated_hr=float(lang_stats["validHrs"]),
86
- total_hr=float(lang_stats["totalHrs"]),
87
- size_bytes=int(lang_stats["size"]),
88
  )
89
  for lang, lang_stats in STATS["locales"].items()
90
  ]
 
82
  release_date=STATS["date"],
83
  num_clips=lang_stats["clips"],
84
  num_speakers=lang_stats["users"],
85
+ validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
86
+ total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
87
+ size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
88
  )
89
  for lang, lang_stats in STATS["locales"].items()
90
  ]
generate_datasets.py CHANGED
@@ -49,6 +49,11 @@ VERSIONS = [
49
  "name": "common_voice_9_0",
50
  "release": "cv-corpus-9.0-2022-04-27",
51
  },
 
 
 
 
 
52
  ]
53
 
54
 
 
49
  "name": "common_voice_9_0",
50
  "release": "cv-corpus-9.0-2022-04-27",
51
  },
52
+ {
53
+ "semver": "10.0.0",
54
+ "name": "common_voice_10_0",
55
+ "release": "cv-corpus-10.0-2022-07-04",
56
+ },
57
  ]
58
 
59
 
languages.ftl CHANGED
@@ -29,6 +29,7 @@ da = Danish
29
  de = German
30
  dsb = Sorbian, Lower
31
  dv = Dhivehi
 
32
  el = Greek
33
  en = English
34
  eo = Esperanto
@@ -97,12 +98,16 @@ my = Burmese
97
  myv = Erzya
98
  nan-tw = Taiwanese (Minnan)
99
  nb-NO = Norwegian Bokmål
 
100
  ne-NP = Nepali
101
  nia = Nias
102
  nl = Dutch
103
  nn-NO = Norwegian Nynorsk
 
 
104
  nyn = Runyankole
105
  oc = Occitan
 
106
  or = Odia
107
  pa-IN = Punjabi
108
  pap-AW = Papiamento (Aruba)
@@ -128,6 +133,8 @@ sl = Slovenian
128
  so = Somali
129
  sq = Albanian
130
  sr = Serbian
 
 
131
  sv-SE = Swedish
132
  sw = Swahili
133
  syr = Syriac
@@ -139,8 +146,10 @@ ti = Tigrinya
139
  tig = Tigre
140
  tk = Turkmen
141
  tl = Tagalog
 
142
  tok = Toki Pona
143
  tr = Turkish
 
144
  tt = Tatar
145
  tw = Twi
146
  ty = Tahitian
@@ -150,12 +159,15 @@ ug = Uyghur
150
  uk = Ukrainian
151
  ur = Urdu
152
  uz = Uzbek
 
153
  vec = Venetian
154
  vi = Vietnamese
155
  vot = Votic
 
156
  yi = Yiddish
157
  yo = Yoruba
158
  yue = Cantonese
159
  zh-CN = Chinese (China)
160
  zh-HK = Chinese (Hong Kong)
161
- zh-TW = Chinese (Taiwan)
 
 
29
  de = German
30
  dsb = Sorbian, Lower
31
  dv = Dhivehi
32
+ dyu = Dioula
33
  el = Greek
34
  en = English
35
  eo = Esperanto
 
98
  myv = Erzya
99
  nan-tw = Taiwanese (Minnan)
100
  nb-NO = Norwegian Bokmål
101
+ nd = IsiNdebele (North)
102
  ne-NP = Nepali
103
  nia = Nias
104
  nl = Dutch
105
  nn-NO = Norwegian Nynorsk
106
+ nr = IsiNdebele (South)
107
+ nso = Northern Sotho
108
  nyn = Runyankole
109
  oc = Occitan
110
+ om = Afaan Ormoo
111
  or = Odia
112
  pa-IN = Punjabi
113
  pap-AW = Papiamento (Aruba)
 
133
  so = Somali
134
  sq = Albanian
135
  sr = Serbian
136
+ ss = Siswati
137
+ st = Southern Sotho
138
  sv-SE = Swedish
139
  sw = Swahili
140
  syr = Syriac
 
146
  tig = Tigre
147
  tk = Turkmen
148
  tl = Tagalog
149
+ tn = Setswana
150
  tok = Toki Pona
151
  tr = Turkish
152
+ ts = Xitsonga
153
  tt = Tatar
154
  tw = Twi
155
  ty = Tahitian
 
159
  uk = Ukrainian
160
  ur = Urdu
161
  uz = Uzbek
162
+ ve = Tshivenda
163
  vec = Venetian
164
  vi = Vietnamese
165
  vot = Votic
166
+ xh = Xhosa
167
  yi = Yiddish
168
  yo = Yoruba
169
  yue = Cantonese
170
  zh-CN = Chinese (China)
171
  zh-HK = Chinese (Hong Kong)
172
+ zh-TW = Chinese (Taiwan)
173
+ zu = Zulu
publish.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from huggingface_hub import create_repo
2
+
3
+ create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset")
test.py CHANGED
@@ -1,5 +1,5 @@
1
  from datasets import load_dataset
2
 
3
- dataset = load_dataset("./common_voice_9_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])
 
1
  from datasets import load_dataset
2
 
3
+ dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])