CarrotAI/EXAONE-3.0-7.8B-Instruct-Llamafied-8k

작업해주신 maywell/EXAONE-3.0-7.8B-Instruct-Llamafied을 참고해서 변경했습니다. GPU 자원이 없으시면 사용하시면 됩니다.
올라간 모델은 8K 컨텍스트까지 지원하도록 설정을 변경하였습니다. (성능 미확인)
import torch
import gc

from transformers import LlamaConfig, LlamaForCausalLM, AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

def unload_model(model):
    del model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def create_llama_config(exaone_config):
    return LlamaConfig(
        vocab_size=exaone_config.vocab_size,
        hidden_size=exaone_config.hidden_size,
        intermediate_size=exaone_config.intermediate_size,
        num_hidden_layers=exaone_config.num_layers,
        num_attention_heads=exaone_config.num_attention_heads,
        max_position_embeddings=exaone_config.max_position_embeddings,
        rms_norm_eps=exaone_config.layer_norm_epsilon,
        num_key_value_heads=exaone_config.num_key_value_heads,
        rope_theta=exaone_config.rope_theta,
        bos_token_id=exaone_config.bos_token_id,
        eos_token_id=exaone_config.eos_token_id,
        pad_token_id=exaone_config.pad_token_id,
        attention_bias=False,
    )

def copy_embedding_weights(llama_model, exaone_model):
    llama_model.model.embed_tokens.weight.data = exaone_model.transformer.wte.weight.data.to(torch.float16)

def copy_layer_weights(llama_layer, exaone_layer):
    # Self-attention
    llama_layer.self_attn.q_proj.weight.data = exaone_layer.attn.attention.q_proj.weight.data.to(torch.float16)
    llama_layer.self_attn.k_proj.weight.data = exaone_layer.attn.attention.k_proj.weight.data.to(torch.float16)
    llama_layer.self_attn.v_proj.weight.data = exaone_layer.attn.attention.v_proj.weight.data.to(torch.float16)
    llama_layer.self_attn.o_proj.weight.data = exaone_layer.attn.attention.out_proj.weight.data.to(torch.float16)
    # MLP
    llama_layer.mlp.gate_proj.weight.data = exaone_layer.mlp.c_fc_0.weight.data.to(torch.float16)
    llama_layer.mlp.up_proj.weight.data = exaone_layer.mlp.c_fc_1.weight.data.to(torch.float16)
    llama_layer.mlp.down_proj.weight.data = exaone_layer.mlp.c_proj.weight.data.to(torch.float16)
    # Layer Norms
    llama_layer.input_layernorm.weight.data = exaone_layer.ln_1.weight.data.to(torch.float16)
    llama_layer.post_attention_layernorm.weight.data = exaone_layer.ln_2.weight.data.to(torch.float16)

def copy_final_weights(llama_model, exaone_model):
    llama_model.model.norm.weight.data = exaone_model.transformer.ln_f.weight.data.to(torch.float16)
    llama_model.lm_head.weight.data = exaone_model.lm_head.weight.data.to(torch.float16)

def port_exaone_to_llama(exaone_model_path, llama_model_path):
    print("Loading EXAONE model and tokenizer...")
    exaone_model = AutoModelForCausalLM.from_pretrained(exaone_model_path, torch_dtype=torch.float16, device_map="cpu", trust_remote_code=True)
    exaone_tokenizer = AutoTokenizer.from_pretrained(exaone_model_path, trust_remote_code=True)
    exaone_config = exaone_model.config

    print("Creating Llama configuration...")
    llama_config = create_llama_config(exaone_config)

    print("Initializing Llama model...")
    llama_model = LlamaForCausalLM(llama_config)
    llama_model.to(torch.float16)
    llama_model.to('cpu')

    print("Copying weights...")
    with torch.no_grad():
        copy_embedding_weights(llama_model, exaone_model)

        for i in tqdm(range(exaone_config.num_layers), desc="Copying layers"):
            copy_layer_weights(llama_model.model.layers[i], exaone_model.transformer.h[i])
            if i % 10 == 0:  # Garbage collection every 10 layers
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

        copy_final_weights(llama_model, exaone_model)

    print("Unloading EXAONE model to free memory...")
    unload_model(exaone_model)

    print(f"Saving ported Llama model and tokenizer to {llama_model_path}")
    llama_model.save_pretrained(llama_model_path, safe_serialization=True, max_shard_size="1GB")
    exaone_tokenizer.save_pretrained(llama_model_path)

    print("Unloading Llama model...")
    unload_model(llama_model)

    print(f"EXAONE model successfully ported to Llama format and saved at {llama_model_path}")

if __name__ == "__main__":
    exaone_model_path = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"
    llama_model_path = "./exa_llamafied"
    port_exaone_to_llama(exaone_model_path, llama_model_path)
모델을 공개해주신 LG AI Research분들께 감사의 말씀 드립니다.