anton-l HF staff committed on
Commit
568b7da
1 Parent(s): cdccd9a
Files changed (4) hide show
  1. README.template.md +2 -2
  2. generate_datasets.py +6 -0
  3. languages.ftl +8 -0
  4. test.py +1 -1
README.template.md CHANGED
@@ -131,7 +131,7 @@ Additional fields include `accent`, `age`, `client_id`, `up_votes`, `down_votes`
131
 
132
  The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.
133
 
134
- The validated data is data that has been validated with reviewers and recieved upvotes that the data is of high quality.
135
 
136
  The invalidated data is data has been invalidated by reviewers
137
  and received downvotes indicating that the data is of low quality.
@@ -153,7 +153,7 @@ In addition, the majority of training sentences end in punctuation ( . or ? or !
153
  ```python
154
  from datasets import load_dataset
155
 
156
- ds = load_dataset("mozilla-foundation/{{NAME}}", "en", use_auth_token=True)
157
 
158
  def prepare_dataset(batch):
159
  """Function to preprocess the dataset with the .map method"""
 
131
 
132
  The speech material has been subdivided into portions for dev, train, test, validated, invalidated, reported and other.
133
 
134
+ The validated data is data that has been validated with reviewers and received upvotes that the data is of high quality.
135
 
136
  The invalidated data is data has been invalidated by reviewers
137
  and received downvotes indicating that the data is of low quality.
 
153
  ```python
154
  from datasets import load_dataset
155
 
156
+ ds = load_dataset("mozilla-foundation/{{DATASET_PATH}}", "en", use_auth_token=True)
157
 
158
  def prepare_dataset(batch):
159
  """Function to preprocess the dataset with the .map method"""
generate_datasets.py CHANGED
@@ -54,6 +54,11 @@ VERSIONS = [
54
  "name": "common_voice_10_0",
55
  "release": "cv-corpus-10.0-2022-07-04",
56
  },
 
 
 
 
 
57
  ]
58
 
59
 
@@ -102,6 +107,7 @@ def main():
102
  with open(f"README.template.md", "r") as fin:
103
  readme = fin.read()
104
  readme = readme.replace("{{NAME}}", release_stats["name"])
 
105
 
106
  locales = sorted(release_stats["locales"].keys())
107
  languages = [f"- {loc}" for loc in locales]
 
54
  "name": "common_voice_10_0",
55
  "release": "cv-corpus-10.0-2022-07-04",
56
  },
57
+ {
58
+ "semver": "11.0.0",
59
+ "name": "common_voice_11_0",
60
+ "release": "cv-corpus-11.0-2022-09-21",
61
+ },
62
  ]
63
 
64
 
 
107
  with open(f"README.template.md", "r") as fin:
108
  readme = fin.read()
109
  readme = readme.replace("{{NAME}}", release_stats["name"])
110
+ readme = readme.replace("{{DATASET_PATH}}", version["name"])
111
 
112
  locales = sorted(release_stats["locales"].keys())
113
  languages = [f"- {loc}" for loc in locales]
languages.ftl CHANGED
@@ -49,6 +49,7 @@ gom = Goan Konkani
49
  ha = Hausa
50
  he = Hebrew
51
  hi = Hindi
 
52
  hr = Croatian
53
  hsb = Sorbian, Upper
54
  ht = Haitian
@@ -63,6 +64,7 @@ is = Icelandic
63
  it = Italian
64
  izh = Izhorian
65
  ja = Japanese
 
66
  ka = Georgian
67
  kaa = Karakalpak
68
  kab = Kabyle
@@ -71,6 +73,7 @@ ki = Kikuyu
71
  kk = Kazakh
72
  km = Khmer
73
  kmr = Kurmanji Kurdish
 
74
  knn = Konkani (Devanagari)
75
  ko = Korean
76
  kpv = Komi-Zyrian
@@ -79,6 +82,8 @@ ky = Kyrgyz
79
  lb = Luxembourgish
80
  lg = Luganda
81
  lij = Ligurian
 
 
82
  lt = Lithuanian
83
  lv = Latvian
84
  mai = Maithili
@@ -125,11 +130,13 @@ sah = Sakha
125
  sat = Santali (Ol Chiki)
126
  sc = Sardinian
127
  scn = Sicilian
 
128
  shi = Shilha
129
  si = Sinhala
130
  sk = Slovak
131
  skr = Saraiki
132
  sl = Slovenian
 
133
  so = Somali
134
  sq = Albanian
135
  sr = Serbian
@@ -167,6 +174,7 @@ xh = Xhosa
167
  yi = Yiddish
168
  yo = Yoruba
169
  yue = Cantonese
 
170
  zh-CN = Chinese (China)
171
  zh-HK = Chinese (Hong Kong)
172
  zh-TW = Chinese (Taiwan)
 
49
  ha = Hausa
50
  he = Hebrew
51
  hi = Hindi
52
+ hil = Hiligaynon
53
  hr = Croatian
54
  hsb = Sorbian, Upper
55
  ht = Haitian
 
64
  it = Italian
65
  izh = Izhorian
66
  ja = Japanese
67
+ jbo = Lojban
68
  ka = Georgian
69
  kaa = Karakalpak
70
  kab = Kabyle
 
73
  kk = Kazakh
74
  km = Khmer
75
  kmr = Kurmanji Kurdish
76
+ kn = Kannada
77
  knn = Konkani (Devanagari)
78
  ko = Korean
79
  kpv = Komi-Zyrian
 
82
  lb = Luxembourgish
83
  lg = Luganda
84
  lij = Ligurian
85
+ ln = Lingala
86
+ lo = Lao
87
  lt = Lithuanian
88
  lv = Latvian
89
  mai = Maithili
 
130
  sat = Santali (Ol Chiki)
131
  sc = Sardinian
132
  scn = Sicilian
133
+ sdh = Southern Kurdish
134
  shi = Shilha
135
  si = Sinhala
136
  sk = Slovak
137
  skr = Saraiki
138
  sl = Slovenian
139
+ snk = Soninke
140
  so = Somali
141
  sq = Albanian
142
  sr = Serbian
 
174
  yi = Yiddish
175
  yo = Yoruba
176
  yue = Cantonese
177
+ zgh = Tamazight
178
  zh-CN = Chinese (China)
179
  zh-HK = Chinese (Hong Kong)
180
  zh-TW = Chinese (Taiwan)
test.py CHANGED
@@ -1,5 +1,5 @@
1
  from datasets import load_dataset
2
 
3
- dataset = load_dataset("mozilla-foundation/common_voice_10_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])
 
1
  from datasets import load_dataset
2
 
3
+ dataset = load_dataset("./common_voice_11_0", "et", split="test", use_auth_token=True)
4
  print(dataset)
5
  print(dataset[100])