ttj committed on
Commit
56c12ac
1 Parent(s): a4ed80e

Upload actual_config.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. actual_config.yaml +104 -0
actual_config.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_config:
2
+ streaming: true
3
+ validation_size_max: 1024
4
+ metadata_config:
5
+ random_sample_metadata: true
6
+ random_sample_metadata_calculate_size: 16384
7
+ random_sample_metadata_weights:
8
+ html: 0.5
9
+ timestamp: 11.56111563110182
10
+ website_desc: 11.033764368362439
11
+ title: 1.0644297987874418
12
+ generation_datasource: 1.0
13
+ entity_paragraph: 11.077104653627899
14
+ metadata_list:
15
+ - html
16
+ - timestamp
17
+ - website_description
18
+ - title
19
+ - url
20
+ - datasource
21
+ - length
22
+ - entity_paragraph
23
+ metadata_column_list:
24
+ - html
25
+ - timestamp
26
+ - website_desc
27
+ - title
28
+ - generation_datasource
29
+ - entity_paragraph
30
+ local_metadata_special_tokens:
31
+ entity_paragraph: entity
32
+ metadata_sep: ' | '
33
+ metadata_key_value_sep: ': '
34
+ metadata_probability: 0.5
35
+ treat_local_metadata_as_regular_text: true
36
+ add_local_metadata_special_tokens_in_prefix: true
37
+ metadata_prefix_sep: ' |||'
38
+ metadata_prefix_start_seq: ''
39
+ max_seq_len: 1024
40
+ html_parser_config:
41
+ all_tags_rules:
42
+ attributes_to_keep:
43
+ - class
44
+ - id
45
+ txt_max_chr_len: 0
46
+ txt_min_chr_len: -.inf
47
+ tags_exceptions_to_txt_max_min_chr_len:
48
+ - table
49
+ - tr
50
+ - th
51
+ - td
52
+ - colgroup
53
+ - thead
54
+ - tfoot
55
+ - tbody
56
+ tags_to_remove_alone_tag_name:
57
+ - body
58
+ tags_to_remove_alone_txt_max_chr_len:
59
+ - .inf
60
+ tags_to_remove_alone_txt_min_chr_len:
61
+ - 0.0
62
+ local_metadata_special_token_start:
63
+ entity_paragraph: <ENTITY_CHAIN>
64
+ local_metadata_special_token_end:
65
+ entity_paragraph: ' </ENTITY_CHAIN> '
66
+ experiment: with_metadata_datasetv2
67
+ per_device_eval_batch_size: 32
68
+ per_device_train_batch_size: 32
69
+ dataset_name: bs-modeling-metadata/c4-en-html-with-metadata
70
+ dataset_config_name: null
71
+ train_file: '*.jsonl.gz'
72
+ validation_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
73
+ overwrite_cache: false
74
+ cache_dir: null
75
+ extension: null
76
+ preprocessing_num_workers: 48
77
+ validation_split_percentage: 5
78
+ block_size: null
79
+ map_batch_size: 1
80
+ weight_decay: 0.01
81
+ learning_rate: 1.0e-05
82
+ num_train_epochs: 1
83
+ max_train_steps: 100000
84
+ lr_scheduler_type: linear
85
+ num_warmup_steps: 6000
86
+ seed: 42
87
+ out_dir: /mnt/ssd-1/bigscience-metadata/lower-lr-2-lower-html-weight
88
+ model_name: gpt2-xl
89
+ project_name: metadata_lm
90
+ jobid: ''
91
+ start_with_eval: false
92
+ extra_steps_to_eval_save_at:
93
+ - 2
94
+ evaluation_strategy: STEPS
95
+ eval_num_per_epoch: 3
96
+ eval_steps: 2000
97
+ save_strategy: STEPS
98
+ save_num_per_epoch: 3
99
+ save_steps: 2000
100
+ do_train: true
101
+ do_eval: true
102
+ gradient_checkpointing: true
103
+ resume_from_checkpoint_dir: null
104
+ gradient_accumulation_steps: 1