neuralink HF staff committed on
Commit
0610800
1 Parent(s): b636cc7
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. checkpoint_metadata.json +9 -0
  2. config.yaml +150 -0
  3. lr_scheduler/lr_scheduler.pt +3 -0
  4. model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  5. model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  6. model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  7. model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  8. model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  9. model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  10. model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  11. model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  12. model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  13. model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  14. model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  15. model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  16. model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors +3 -0
  17. model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  18. model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  19. model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  20. model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  21. model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  22. model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  23. model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  24. model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  25. model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  26. model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  27. model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  28. model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  29. model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  30. model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  31. model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  32. model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  33. model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  34. model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  35. model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  36. model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  37. model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  38. model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors +3 -0
  39. model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  40. model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  41. model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  42. model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  43. model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  44. model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  45. model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
  46. model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors +3 -0
  47. model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
  48. model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors +3 -0
  49. model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors +3 -0
  50. model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors +3 -0
checkpoint_metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dp": 6,
3
+ "metas": {
4
+ "consumed_train_samples": 1920000,
5
+ "last_train_step": 20000
6
+ },
7
+ "tp": 4,
8
+ "version": "1.2"
9
+ }
config.yaml ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ checkpoints:
2
+ checkpoint_interval: 1000
3
+ checkpoints_path: /fsx/phuc/new_workspace/experiments/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
4
+ checkpoints_path_is_shared_file_system: true
5
+ resume_checkpoint_path: /fsx/phuc/new_workspace/experiments/infini_attention_8b_llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay/checkpoints
6
+ save_initial_state: false
7
+ data:
8
+ dataset:
9
+ dataloader_type: single
10
+ dataset_max_tokens: null
11
+ dataset_weights:
12
+ - 0.3
13
+ - 0.3
14
+ - 0.45
15
+ - 0.15
16
+ - 0.08
17
+ - 0.02
18
+ datasets:
19
+ - dtype: uint32
20
+ filename_pattern: .*.ds
21
+ folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/long/
22
+ skip_tokens: 0
23
+ - dtype: uint32
24
+ filename_pattern: .*.ds
25
+ folder: s3://huggingface-llm-datasets/stack_full_v21-8k/tokenized-llama3/short/
26
+ skip_tokens: 0
27
+ - dtype: uint32
28
+ filename_pattern: .*.ds
29
+ folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/long/CC-MAIN-2024-10
30
+ skip_tokens: 0
31
+ - dtype: uint32
32
+ filename_pattern: .*.ds
33
+ folder: s3://huggingface-llm-datasets/fineweb-v1-8k/tokenized-llama3/short/CC-MAIN-2024-10
34
+ skip_tokens: 0
35
+ - dtype: uint32
36
+ filename_pattern: .*.ds
37
+ folder: s3://huggingface-llm-datasets/project-gutenberg/tokenized-llama3/
38
+ skip_tokens: 0
39
+ - dtype: uint32
40
+ filename_pattern: .*.ds
41
+ folder: s3://huggingface-llm-datasets/OpenHermes-2-5/tokenized-llama3
42
+ skip_tokens: 0
43
+ pad_samples_to_global_batch_size: false
44
+ skip_in_stream: true
45
+ num_loading_workers: 0
46
+ seed: 42
47
+ data_stages: null
48
+ experiment_logger:
49
+ tensorboard_logger:
50
+ flush_secs: 30
51
+ tensorboard_dir: /fsx/phuc/project_data/infini_attention/tb_logs
52
+ wandb_logger:
53
+ wandb_entity: null
54
+ wandb_project: infini_attention_8b_llama
55
+ general:
56
+ benchmark_csv_path: null
57
+ consumed_train_samples: 1920000
58
+ ignore_sanity_checks: true
59
+ project: infini_attention_8b_llama
60
+ run: exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
61
+ seed: 42
62
+ step: 20000
63
+ infini_attention:
64
+ balance_act_type: orig_sigmoid
65
+ balance_factor_lr: 0.01
66
+ balance_factor_weight_decay: 0.0
67
+ balance_init_type: zeros
68
+ log_grad: false
69
+ log_segment_acts: false
70
+ logging: true
71
+ logging_interval: 250
72
+ segment_length: 64
73
+ turn_on_memory: true
74
+ kill_switch_path: null
75
+ lighteval: null
76
+ logging:
77
+ iteration_step_info_interval: 1
78
+ log_level: info
79
+ log_level_replica: info
80
+ model:
81
+ ddp_bucket_cap_mb: 25
82
+ dtype: bfloat16
83
+ init_method:
84
+ path: /fsx/phuc/projects/infini-attention/llama3-ckps/haojun-8b-llama-nanotron-ckp/NanotronLlama3-8B
85
+ make_vocab_size_divisible_by: 1
86
+ model_config:
87
+ bos_token_id: 128000
88
+ eos_token_id: 128001
89
+ hidden_act: silu
90
+ hidden_size: 4096
91
+ initializer_range: 0.02
92
+ intermediate_size: 14336
93
+ is_llama_config: true
94
+ max_position_embeddings: 8192
95
+ num_attention_heads: 32
96
+ num_hidden_layers: 32
97
+ num_key_value_heads: 8
98
+ pad_token_id: null
99
+ pretraining_tp: 1
100
+ rms_norm_eps: 1.0e-05
101
+ rope_interleaved: false
102
+ rope_scaling: null
103
+ rope_theta: 500000.0
104
+ tie_word_embeddings: false
105
+ use_cache: true
106
+ vocab_size: 128256
107
+ optimizer:
108
+ accumulate_grad_in_fp32: false
109
+ adam_beta1: 0.9
110
+ adam_beta2: 0.95
111
+ adam_eps: 1.0e-08
112
+ clip_grad: 1.0
113
+ learning_rate_scheduler:
114
+ learning_rate: 1.0e-05
115
+ lr_decay_starting_step: null
116
+ lr_decay_steps: 23500
117
+ lr_decay_style: cosine
118
+ lr_warmup_steps: 1500
119
+ lr_warmup_style: linear
120
+ min_decay_lr: 1.0e-06
121
+ torch_adam_is_fused: true
122
+ weight_decay: 0.1
123
+ zero_stage: 0
124
+ parallelism:
125
+ dp: 6
126
+ expert_parallel_size: 1
127
+ pp: 1
128
+ pp_engine: 1f1b
129
+ tp: 4
130
+ tp_linear_async_communication: false
131
+ tp_mode: ALL_REDUCE
132
+ profiler: null
133
+ s3_upload:
134
+ remove_after_upload: true
135
+ s5cmd_concurrency: 5
136
+ s5cmd_numworkers: 16
137
+ s5cmd_path: null
138
+ upload_s3_path: s3://phuc-experiments/infini-attention/8b-llama/exp57_8b_llama_1024_ctx_length_and_64_segment_length_and_100k_bs_and_global_lr_1.0e-5_and_balance_factor_lr_0.01_and_balance_factor_0_weight_decay
139
+ tokenizer:
140
+ tokenizer_max_length: null
141
+ tokenizer_name_or_path: /fsx/haojun/lighteval_evaluation_model/NanotronLlama3-8B
142
+ tokenizer_revision: null
143
+ tokens:
144
+ batch_accumulation_per_replica: 1
145
+ limit_test_batches: 0
146
+ limit_val_batches: 0
147
+ micro_batch_size: 16
148
+ sequence_length: 1024
149
+ train_steps: 25000
150
+ val_check_interval: -1
lr_scheduler/lr_scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:737facbe7635b84da684cbb0920e1e12cbfa59d865027e3d29946e1da7fcb6c9
3
+ size 5812
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd3cd7ea1b37e9d8245500104946a1f9beda585b582c3c6c86417f2143e62c0a
3
+ size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ecce917758b783cf75bcfba65bb98e2598b47fa369e79e97217dac514fe7cb
3
+ size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4eca6ff9837681e57b8b7359a0b1450a2a2faaa217f191343a224bcfa4bac2d
3
+ size 200
model/model/decoder/0/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e26e50995d04b476cef24f25fe3b123db242b79d2b26721e958a27a94e95c3
3
+ size 200
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afbb7fafb264594a507fe03060a966b91335e60401aea3f3531c9036a37bdc2b
3
+ size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd9505ee95b8f6e228216d63826f095d1d2bd704c090a9496b0cd204b5dc3cc7
3
+ size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f4f77e670759e11cca92e8eb1a4ca8cb1d997cbe1a3c7ec44097d91704dec79
3
+ size 8388848
model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20f2977cc2ccc9df4828d9170c829143ac84e7bf1a10a17dbcff03b8e7d2b9c4
3
+ size 8388848
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b85fff5b4969a14a90d3251beea89a4f58b3476951dc4d80842fbc42859551a6
3
+ size 12583264
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c906c130138839577b399f263d2cd0377c684161cab8eb9db82cf4f30e178fa1
3
+ size 12583272
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:691718e5119e0a7115a84454c6ed9eafd768b9b9783f747c698059db2233224e
3
+ size 12583272
model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b315f2fdabb2b6e6aaa65bcbc1e77d7c307f50b20de2b3f55501e3c9e355884
3
+ size 12583272
model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839be36eddaad9760a68863d4618402af1893620b281a05ff0ff9e7cfe0ed802
3
+ size 8288
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d638b1dbd9ab13fbfbeb47885a43ae81679b199c18c0cbaed3f202cf4c36942
3
+ size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a030aaac659587f67dca0efc2b49aaa8b8736eed1906d2f5980529f6c7fe45c1
3
+ size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5c763bab5ae055a0f8d095329c60c4ad64f0d05a52db890ed860d003f0f14ee
3
+ size 29360368
model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4001b487aa10205cc4b4a335c2df5dbcf2b6cf692a19b90e1e6346a6adb25df5
3
+ size 29360368
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eafbd550659acf898eea0fab8867281ce636543eff7f13c40f32f9028bda67d2
3
+ size 58720552
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3db63345724e44faf6ba407334ecc96348ba03e0192f9f60550518368e87b6ac
3
+ size 58720560
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81385d8ec0ae77104655fbb1f5dbf799f70466a32f708c04cb47080b2d46d3e3
3
+ size 58720560
model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9ba738a34d22e416b1db4bded73974c726edb399b284d23eceb2ef16da5dd06
3
+ size 58720560
model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:835706c82a03cc5e4a4c3879f76dde2c8873efd2e11c7e6d39789798af173773
3
+ size 8288
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10aec76847af0d3b72cbf894d8ef1629db51ddf0721f2e9a839d7294e94b25ba
3
+ size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44e7dc8301e6cac7b9cff6fa11dc95222e33c275101952f0fe8096d6e78927db
3
+ size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ced99e5e02b4b2df6734a354a1bb1eee1ee32e0f7d71c00c15ba50349c520bc
3
+ size 200
model/model/decoder/1/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62cc82e79b26a0311182c0a0bc0e565d89e7205950536ff511a4190dfe5b4cb2
3
+ size 200
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2fbb9870ae118de4d8225df2d0111916f337e87f6ffb171e6e99e942a6f5e84
3
+ size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99382bce346eb0aa855de26305cc64d581c209306867ecfe665209e142724cd1
3
+ size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df614829f243bc8d2c4ba3deda49e8a9bab6ced8cd2e63c46b08fdae176ab6c6
3
+ size 8388848
model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a42aec6f5c07a392ebfc588f67c9b8713c3f1cbf1dad988c3bb7ae0ed72a4d47
3
+ size 8388848
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8005058e37aa101f31327e80c1d3a2646a3303acab2e21f6e78c0f4f72f01495
3
+ size 12583264
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef2b1fe9ec243434a30e0d2aef93794229ca86157146e79a8037b144494246b0
3
+ size 12583272
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92f89cc57b8db928d5dd5b2baf9422acf4a6fcb5b13d7c57a63fb6891bd68bcf
3
+ size 12583272
model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3feca5b544559386a355f31d3658237169eb8c170523fdac2047c0aec838a8b6
3
+ size 12583272
model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3e247f0ff9fef5db3801019d29cbab0e939f9811bc87948dbf2ffccf3c804c7
3
+ size 8288
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de65ac6d4af028cfa691029645ba2cdbd80e8717789505e5850c5978807256b
3
+ size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9d8b4af82370734fa7f9c2a99958a732b8face2142d04ddd7d0b7321eb7af71
3
+ size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c30074512f3a8e912c0f3efd0546d3049d42f2781625ee29184de6b8d2f2b55
3
+ size 29360368
model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:111a16b5ed3e23b81976ccee35575e1cb2d7a17f675b9f448315cf813642b157
3
+ size 29360368
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:374da23673af8dde22742199fb2c504849f54ff68978d13e856b3d66f7e1233b
3
+ size 58720552
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afd78bf516b96f51ead6a14a5821cec58857c46c98c6ed1bc99924ca4bf9c67b
3
+ size 58720560
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23e758214cc2fd7c534e5face960fe4123b37d76889d95eec0d548f3734a54ad
3
+ size 58720560
model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-3-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e5d9e4777df66e6bd1affbdb44ed554e66913b0077b83d8551c18979fd5089e
3
+ size 58720560
model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:915cdab464967593881825306bb37d3565c1463d54c596606f4756d8b4f3023b
3
+ size 8288
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-0-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0800aeaafa9b470cde1e420ebf24a853846755269f90f9b0a54316e4c0666ef9
3
+ size 200
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-1-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b37c0e48385b0aff19585ad996520319d194b68d8c5bef9a0255b35db6391f19
3
+ size 200
model/model/decoder/10/pp_block/attn/model_balance_factors_pp-rank-0-of-1_tp-rank-2-of-4.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f81fafc188236802bae75d2f0309ed3988f9d0c7f87a6d4689670be8b41cf8a0
3
+ size 200