---
# Training-run configuration: dataset/metadata options under `data_config`,
# optimizer, schedule, evaluation and checkpoint options at the top level.
#
# NOTE(review): the previous revision had every key collapsed onto two
# physical lines, which is not parseable YAML. The nesting below was
# reconstructed from the key grouping and order; confirm it against the
# schema of the program that loads this file.

data_config:
  streaming: true
  validation_size_max: 1024

  metadata_config:
    random_sample_metadata: true
    random_sample_metadata_calculate_size: 16384
    # NOTE(review): the keys here match `metadata_column_list`
    # (`website_desc`, `generation_datasource`), not `metadata_list`
    # (`website_description`, `datasource`). `url` and `length` have no
    # weight entry. Confirm this is intended.
    random_sample_metadata_weights:
      html: 0.5
      timestamp: 11.56111563110182
      website_desc: 11.033764368362439
      title: 1.0644297987874418
      generation_datasource: 1.0
      entity_paragraph: 11.077104653627899
    metadata_list:
      - html
      - timestamp
      - website_description
      - title
      - url
      - datasource
      - length
      - entity_paragraph
    metadata_column_list:
      - html
      - timestamp
      - website_desc
      - title
      - generation_datasource
      - entity_paragraph
    local_metadata_special_tokens:
      entity_paragraph: entity
    # Separator strings are quoted so their leading/trailing spaces survive.
    metadata_sep: ' | '
    metadata_key_value_sep: ': '
    metadata_probability: 0.5
    treat_local_metadata_as_regular_text: true
    add_local_metadata_special_tokens_in_prefix: true
    metadata_prefix_sep: ' |||'
    metadata_prefix_start_seq: ''
    max_seq_len: 1024

    html_parser_config:
      all_tags_rules:
        attributes_to_keep:
          - class
          - id
        txt_max_chr_len: 0
        # `-.inf` / `.inf` are the YAML float literals for -infinity / +infinity.
        txt_min_chr_len: -.inf
        tags_exceptions_to_txt_max_min_chr_len:
          - table
          - tr
          - th
          - td
          - colgroup
          - thead
          - tfoot
          - tbody
      # The three lists below each have one entry.
      # NOTE(review): presumably they are read index-by-index (one rule per
      # tag name) — confirm against the consumer.
      tags_to_remove_alone_tag_name:
        - body
      tags_to_remove_alone_txt_max_chr_len:
        - .inf
      tags_to_remove_alone_txt_min_chr_len:
        - 0.0

    local_metadata_special_token_start:
      # The value was left empty in the previous revision, which YAML loads
      # as null; it is written explicitly here with the same parsed value.
      # NOTE(review): the matching end token below is the string ' ', so a
      # string (e.g. '') may have been intended here — confirm.
      entity_paragraph: null
    local_metadata_special_token_end:
      entity_paragraph: ' '

  experiment: with_metadata_datasetv2
  per_device_eval_batch_size: 32
  per_device_train_batch_size: 32
  dataset_name: bs-modeling-metadata/c4-en-html-with-metadata
  dataset_config_name: null
  # Quoted: an unquoted leading `*` would be parsed as a YAML alias.
  train_file: '*.jsonl.gz'
  validation_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
  overwrite_cache: false
  cache_dir: null
  extension: null
  preprocessing_num_workers: 48
  validation_split_percentage: 5
  block_size: null
  map_batch_size: 1

# Optimizer and schedule.
weight_decay: 0.01
learning_rate: 1.0e-05
num_train_epochs: 1
max_train_steps: 100000
lr_scheduler_type: linear
num_warmup_steps: 6000
gradient_accumulation_steps: 1
gradient_checkpointing: true
seed: 42

# Run identity and output location.
out_dir: /mnt/ssd-1/bigscience-metadata/lower-lr-2-lower-html-weight
model_name: gpt2-xl
project_name: metadata_lm
jobid: ''
resume_from_checkpoint_dir: null

# Evaluation and checkpoint cadence.
do_train: true
do_eval: true
start_with_eval: false
extra_steps_to_eval_save_at:
  - 2
evaluation_strategy: STEPS
eval_num_per_epoch: 3
eval_steps: 2000
save_strategy: STEPS
save_num_per_epoch: 3
save_steps: 2000