anton-l HF staff committed on
Commit
56717e3
1 Parent(s): 715ab66
Files changed (8) hide show
  1. .gitattributes +0 -0
  2. .gitignore +0 -0
  3. README.template.md +35 -3
  4. dataset_script.py +3 -3
  5. generate_datasets.py +5 -0
  6. languages.ftl +13 -1
  7. publish.py +3 -0
  8. test.py +1 -1
.gitattributes CHANGED
File without changes
.gitignore CHANGED
File without changes
README.template.md CHANGED
@@ -4,9 +4,9 @@ annotations_creators:
4
  - crowdsourced
5
  language_creators:
6
  - crowdsourced
7
- languages:
8
  {{LANGUAGES}}
9
- licenses:
10
  - cc0-1.0
11
  multilinguality:
12
  - multilingual
@@ -68,7 +68,7 @@ Take a look at the [Languages](https://commonvoice.mozilla.org/en/languages) pag
68
  ### Supported Tasks and Leaderboards
69
 
70
  The results for models trained on the Common Voice datasets are available via the
71
- [Papers with Code Leaderboards](https://paperswithcode.com/dataset/common-voice)
72
 
73
  ### Languages
74
 
@@ -142,6 +142,38 @@ The other data is data that has not yet been reviewed.
142
 
143
  The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
144
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  ## Dataset Creation
146
 
147
  ### Curation Rationale
 
4
  - crowdsourced
5
  language_creators:
6
  - crowdsourced
7
+ language_bcp47:
8
  {{LANGUAGES}}
9
+ license:
10
  - cc0-1.0
11
  multilinguality:
12
  - multilingual
 
68
  ### Supported Tasks and Leaderboards
69
 
70
  The results for models trained on the Common Voice datasets are available via the
71
+ [🤗 Speech Bench](https://huggingface.co/spaces/huggingface/hf-speech-bench)
72
 
73
  ### Languages
74
 
 
142
 
143
  The dev, test, train are all data that has been reviewed, deemed of high quality and split into dev, test and train.
144
 
145
+ ## Data Preprocessing Recommended by Hugging Face
146
+
147
+ The following are data preprocessing steps advised by the Hugging Face team. They are accompanied by an example code snippet that shows how to put them into practice.
148
+
149
+ Many examples in this dataset have trailing quotation marks, e.g. _“the cat sat on the mat.”_. These trailing quotation marks do not change the actual meaning of the sentence, and it is nearly impossible to infer whether a sentence is a quotation or not a quotation from audio data alone. In these cases, it is advised to strip the quotation marks, leaving: _the cat sat on the mat_.
150
+
151
+ In addition, the majority of training sentences end in punctuation ( . or ? or ! ), whereas just a small proportion do not. In the dev set, **almost all** sentences end in punctuation. Thus, it is recommended to append a full-stop ( . ) to the end of the small number of training examples that do not end in punctuation.
152
+
153
+ ```python
154
+ from datasets import load_dataset
155
+
156
+ ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
157
+
158
+ def prepare_dataset(batch):
159
+ """Function to preprocess the dataset with the .map method"""
160
+ transcription = batch["sentence"]
161
+
162
+ if transcription.startswith('"') and transcription.endswith('"'):
163
+ # we can remove trailing quotation marks as they do not affect the transcription
164
+ transcription = transcription[1:-1]
165
+
166
+ if transcription[-1] not in [".", "?", "!"]:
167
+ # append a full-stop to sentences that do not end in punctuation
168
+ transcription = transcription + "."
169
+
170
+ batch["sentence"] = transcription
171
+
172
+ return batch
173
+
174
+ ds = ds.map(prepare_dataset, desc="preprocess dataset")
175
+ ```
176
+
177
  ## Dataset Creation
178
 
179
  ### Curation Rationale
dataset_script.py CHANGED
@@ -82,9 +82,9 @@ class CommonVoice(datasets.GeneratorBasedBuilder):
82
  release_date=STATS["date"],
83
  num_clips=lang_stats["clips"],
84
  num_speakers=lang_stats["users"],
85
- validated_hr=float(lang_stats["validHrs"]),
86
- total_hr=float(lang_stats["totalHrs"]),
87
- size_bytes=int(lang_stats["size"]),
88
  )
89
  for lang, lang_stats in STATS["locales"].items()
90
  ]
 
82
  release_date=STATS["date"],
83
  num_clips=lang_stats["clips"],
84
  num_speakers=lang_stats["users"],
85
+ validated_hr=float(lang_stats["validHrs"]) if lang_stats["validHrs"] else None,
86
+ total_hr=float(lang_stats["totalHrs"]) if lang_stats["totalHrs"] else None,
87
+ size_bytes=int(lang_stats["size"]) if lang_stats["size"] else None,
88
  )
89
  for lang, lang_stats in STATS["locales"].items()
90
  ]
generate_datasets.py CHANGED
@@ -49,6 +49,11 @@ VERSIONS = [
49
  "name": "common_voice_9_0",
50
  "release": "cv-corpus-9.0-2022-04-27",
51
  },
 
 
 
 
 
52
  ]
53
 
54
 
 
49
  "name": "common_voice_9_0",
50
  "release": "cv-corpus-9.0-2022-04-27",
51
  },
52
+ {
53
+ "semver": "10.0.0",
54
+ "name": "common_voice_10_0",
55
+ "release": "cv-corpus-10.0-2022-07-04",
56
+ },
57
  ]
58
 
59
 
languages.ftl CHANGED
@@ -29,6 +29,7 @@ da = Danish
29
  de = German
30
  dsb = Sorbian, Lower
31
  dv = Dhivehi
 
32
  el = Greek
33
  en = English
34
  eo = Esperanto
@@ -97,12 +98,16 @@ my = Burmese
97
  myv = Erzya
98
  nan-tw = Taiwanese (Minnan)
99
  nb-NO = Norwegian Bokmål
 
100
  ne-NP = Nepali
101
  nia = Nias
102
  nl = Dutch
103
  nn-NO = Norwegian Nynorsk
 
 
104
  nyn = Runyankole
105
  oc = Occitan
 
106
  or = Odia
107
  pa-IN = Punjabi
108
  pap-AW = Papiamento (Aruba)
@@ -128,6 +133,8 @@ sl = Slovenian
128
  so = Somali
129
  sq = Albanian
130
  sr = Serbian
 
 
131
  sv-SE = Swedish
132
  sw = Swahili
133
  syr = Syriac
@@ -139,8 +146,10 @@ ti = Tigrinya
139
  tig = Tigre
140
  tk = Turkmen
141
  tl = Tagalog
 
142
  tok = Toki Pona
143
  tr = Turkish
 
144
  tt = Tatar
145
  tw = Twi
146
  ty = Tahitian
@@ -150,12 +159,15 @@ ug = Uyghur
150
  uk = Ukrainian
151
  ur = Urdu
152
  uz = Uzbek
 
153
  vec = Venetian
154
  vi = Vietnamese
155
  vot = Votic
 
156
  yi = Yiddish
157
  yo = Yoruba
158
  yue = Cantonese
159
  zh-CN = Chinese (China)
160
  zh-HK = Chinese (Hong Kong)
161
- zh-TW = Chinese (Taiwan)
 
 
29
  de = German
30
  dsb = Sorbian, Lower
31
  dv = Dhivehi
32
+ dyu = Dioula
33
  el = Greek
34
  en = English
35
  eo = Esperanto
 
98
  myv = Erzya
99
  nan-tw = Taiwanese (Minnan)
100
  nb-NO = Norwegian Bokmål
101
+ nd = IsiNdebele (North)
102
  ne-NP = Nepali
103
  nia = Nias
104
  nl = Dutch
105
  nn-NO = Norwegian Nynorsk
106
+ nr = IsiNdebele (South)
107
+ nso = Northern Sotho
108
  nyn = Runyankole
109
  oc = Occitan
110
+ om = Afaan Ormoo
111
  or = Odia
112
  pa-IN = Punjabi
113
  pap-AW = Papiamento (Aruba)
 
133
  so = Somali
134
  sq = Albanian
135
  sr = Serbian
136
+ ss = Siswati
137
+ st = Southern Sotho
138
  sv-SE = Swedish
139
  sw = Swahili
140
  syr = Syriac
 
146
  tig = Tigre
147
  tk = Turkmen
148
  tl = Tagalog
149
+ tn = Setswana
150
  tok = Toki Pona
151
  tr = Turkish
152
+ ts = Xitsonga
153
  tt = Tatar
154
  tw = Twi
155
  ty = Tahitian
 
159
  uk = Ukrainian
160
  ur = Urdu
161
  uz = Uzbek
162
+ ve = Tshivenda
163
  vec = Venetian
164
  vi = Vietnamese
165
  vot = Votic
166
+ xh = Xhosa
167
  yi = Yiddish
168
  yo = Yoruba
169
  yue = Cantonese
170
  zh-CN = Chinese (China)
171
  zh-HK = Chinese (Hong Kong)
172
+ zh-TW = Chinese (Taiwan)
173
+ zu = Zulu
publish.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from huggingface_hub import create_repo
2
+
3
+ create_repo("mozilla-foundation/common_voice_10_0", repo_type="dataset")
test.py CHANGED
@@ -1,5 +1,5 @@
1
  from datasets import load_dataset
2
 
3
- dataset = load_dataset("./common_voice_9_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])
 
1
  from datasets import load_dataset
2
 
3
+ dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])