Attn,mlp fp16+1bit
pingbowen23 committed Mar 14, 2024
commit 6836149e19514a49132f904604fc8e84d11c6c35
30 changes: 12 additions & 18 deletions bitdelta/diff2.py
@@ -64,16 +64,22 @@ def compress_submodule(name, subname, module, submodule):
torch.cuda.empty_cache()
setattr(module, subname, compressed)

# TODO: this can be parallelized
# TODO: choose the compression ratio according to thresh
for name, module in finetuned_compressed_model.named_modules():

if "self_attn" in name:
if "self_attn" in name or "mlp" in name:
for subname, submodule in module.named_children():
if "proj" in subname:
base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
# compress_submodule(name, subname, module, submodule)
U,S,V = decomposition(finetuned_weight - base_weight,dim=1024)
dim , thresh = 1024,0.7

if "mlp" in name:
dim , thresh = 2048 , 0.24

U,S,V = decomposition(finetuned_weight - base_weight,dim=dim)
energy_total = torch.sum(S**2)
energy_top_percent = torch.sum(S[:50]**2)
ratio = energy_top_percent / energy_total

compressed_U, compressed_V = BinaryDiff(weight=U[:,64:]).to(finetuned_weight.device), BinaryDiff(weight=V[:,64:]).to(finetuned_weight.device)
U_mask, U_coeff, V_mask, V_coeff = compressed_U.mask, compressed_U.coeff, compressed_V.mask, compressed_V.coeff
@@ -82,19 +88,7 @@ def compress_submodule(name, subname, module, submodule):
U[:,64:] , V[:,64:] = weight_U.T, weight_V.T # not sure whether this is buggy
delta = U @ torch.diag(S) @ V.t()
with torch.no_grad():
finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16))


elif "mlp" in name:
with torch.no_grad():
for subname, submodule in module.named_children():
if "proj" in subname:
base_weight = base_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
finetuned_weight = finetuned_model.get_submodule(f"{name}.{subname}").weight.detach().to(submodule.weight.device)
U,S,V = decomposition(finetuned_weight - base_weight,dim=int(128 * 1.45))
delta = torch.mm(torch.mm(U, torch.diag(S)), V.t())
finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(torch.bfloat16))

finetuned_model.get_submodule(f"{name}.{subname}").weight.copy_(base_weight + delta.to(base_weight.dtype))

finetuned_model.save_pretrained(save_dir)
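
For context, the updated branch above factors the weight delta with a truncated SVD, keeps the first 64 singular directions of U and V in full precision, and 1-bit-quantizes the remaining columns. A minimal self-contained sketch of that idea, using `torch.svd_lowrank` as a stand-in for the repo's `decomposition` helper and inlining a BitDelta-style sign-and-scale quantizer (both stand-ins are assumptions, not the repo's exact code):

```python
import torch

def hybrid_compress_delta(delta: torch.Tensor, dim: int = 1024, keep: int = 64) -> torch.Tensor:
    # Truncated SVD: delta ~= U @ diag(S) @ V.T with `dim` components.
    U, S, V = torch.svd_lowrank(delta.float(), q=dim)
    # Keep the first `keep` columns in full precision; replace the tail of U and V
    # with a sign mask scaled by its mean absolute value (1 bit + one fp scalar).
    for M in (U, V):
        tail = M[:, keep:]
        M[:, keep:] = torch.sign(tail) * tail.abs().mean()
    return U @ torch.diag(S) @ V.t()

# Usage sketch:
#   approx = hybrid_compress_delta(finetuned_weight - base_weight)
#   weight.copy_(base_weight + approx.to(base_weight.dtype))
```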

281 changes: 281 additions & 0 deletions cosine_sim_check.py
@@ -0,0 +1,281 @@
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import torch
from torch import nn
import gc
import torch.nn.functional as F
from bitdelta.diff import save_diff, save_full_model
from bitdelta.misc import find_corr_stddev
from bitdelta.binary_gemm_kernel import pack, unpack, binary_bmm
from bitdelta.utils import get_model, parse_args, get_tokenizer
from tqdm import tqdm
from bitdelta.data import get_dataset, get_dataloader

import json
import transformers

import re
import random
import numpy as np

def set_random_seed(seed: int = 0):
"""
set random seed
:param seed: int, random seed
:return:
"""
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

set_random_seed(seed=0)

def get_param_names_to_merge(input_param_names: list, exclude_param_names_regex: list):
"""
get the names of parameters that need to be merged
:param input_param_names: list, names of input parameters
:param exclude_param_names_regex: list, regular expression of names of parameters that need to be excluded
:return:
"""
param_names_to_merge = []
for param_name in input_param_names:
exclude = any([re.match(exclude_pattern, param_name) for exclude_pattern in exclude_param_names_regex])
if not exclude:
param_names_to_merge.append(param_name)
return param_names_to_merge


def get_model(model_path):
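# NOTE: this local definition shadows the get_model imported from bitdelta.utils above.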
if "mistral" in model_path or "mixtral" in model_path:
data_type = torch.bfloat16
else:
data_type = torch.float16
with torch.no_grad():
model = transformers.AutoModelForCausalLM.from_pretrained(
model_path,
torch_dtype=data_type,
low_cpu_mem_usage=True,
# device_map="auto"
).to("cuda")
return model




def singular_values_for_variance(tensor, variances=[0.9, 0.95]):
"""
Calculate the minimum number of singular values needed to reach specified variance ratios.

Parameters:
- tensor: A 2D tensor for which to calculate the SVD.
- variances: A list of variance ratios to calculate the minimum number of singular values for.

Returns:
A dictionary with the variance ratios as keys and the minimum number of singular values needed as values.
"""
# Compute SVD
U, S, V = torch.svd(tensor)
# Calculate the squared singular values (proportional to variance explained)
squared_singular_values = torch.pow(S, 2)
total_variance = torch.sum(squared_singular_values)
cumulative_variance_ratios = torch.cumsum(squared_singular_values, dim=0) / total_variance

# Find the minimum number of singular values for each specified variance
results = {}
for variance in variances:
num_singular_values = torch.searchsorted(cumulative_variance_ratios, variance) + 1 # +1 because indices start at 0
results[variance] = num_singular_values.item()

return results
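
# Illustrative usage (not part of the original script): for a matrix of rank ~64,
# e.g. A = torch.randn(4096, 64) @ torch.randn(64, 4096),
# singular_values_for_variance(A) returns something like {0.9: 55, 0.95: 60},
# i.e. a few dozen of the 4096 directions already carry 90-95% of the variance.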


def cosine_similarity_matrix(finetuned_param, pretrained_param):
finetuned_flat = finetuned_param.view(-1)
pretrained_flat = pretrained_param.view(-1)
cosine_similarity = F.cosine_similarity(finetuned_flat.unsqueeze(0), pretrained_flat.unsqueeze(0), dim=1)
return cosine_similarity.item()


def check_delta_properties(delta_weight):
# analysis properties for each linear weight in deltas

# Frobenius norm of the matrix
matrix_norm = torch.norm(delta_weight, p='fro')

# condition number of the matrix
# The condition number measures the numerical stability of inverting a matrix:
# it describes how small perturbations of the input data affect the result of
# matrix operations. The higher the condition number, the more sensitive the
# result is to small changes in the data (the numerical solution may be
# unstable); the lower it is, the more stable the matrix and its operations.

# Definition
# For a nonsingular matrix A, the condition number is the product of the norm
# of A and the norm of A's inverse: kappa(A) = ||A|| * ||A^-1||.
# Any matrix norm works, but the 2-norm (spectral norm) is the most common,
# in which case the condition number is the ratio of the largest to the
# smallest singular value of the matrix.
cond_number = torch.linalg.cond(delta_weight)

# rank of the matrix
rank = torch.linalg.matrix_rank(delta_weight)

# effective rank of the matrix (number of singular values capturing 90% / 95% of the variance)
rank_eff = singular_values_for_variance(delta_weight, variances=[0.9, 0.95])
rank_90, rank_95 = rank_eff[0.9], rank_eff[0.95]


return matrix_norm, cond_number, rank, rank_90, rank_95
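
# Quick sanity check for the 2-norm condition number (illustrative): for
# torch.diag(torch.tensor([10.0, 0.1])) the singular values are 10 and 0.1,
# so torch.linalg.cond(...) returns sigma_max / sigma_min = 100.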




## First part: check the cosine similarity of the first-layer FFN w1

# if "llama" in base_model_path:
# #weight_key = "model.layers.0.mlp.gate_proj.weight"
# tensor_base = base_model.model.layers[0].mlp.gate_proj.weight
# tensor_ft = finetuned_model.model.layers[0].mlp.gate_proj.weight
# cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1)
# overall_similarity = cosine_sim.mean()
# base_model_name = base_model_path.split("/")[-1]
# finetuned_model_name = finetuned_model_path.split("/")[-1]
# overall_similarity_result = overall_similarity.item()
# print(f"Overall Cosine Similarity between {base_model_name} and {finetuned_model_name}: {overall_similarity_result}")
# ## i.e., this is a llama model
# elif "Mixtral" in base_model_path:
# tensor_base = base_model.model.layers[0].block_sparse_moe.experts[0].w1.weight
# tensor_ft = base_model.model.layers[0].block_sparse_moe.experts[1].w1.weight
# cosine_sim = F.cosine_similarity(tensor_base, tensor_ft, dim=1)
# overall_similarity = cosine_sim.mean()




## Second part: check the decline potential of the delta's squared singular values using scaled weights

## Third part: check the rank of the original delta and of the
## scaled delta (relation between variance ratio and number of singular values)

def analysis_delta(base_model_path, finetuned_model_path):
pretrained_model = get_model(base_model_path)
finetuned_model = get_model(finetuned_model_path)
print(f"We are analysising the delta between the Pretrained model: {base_model_path} and the Finetuned model: {finetuned_model_path}")
task_vector_param_dict = {}
pretrained_param_dict = {param_name: param_value for param_name, param_value in pretrained_model.named_parameters()}
finetuned_param_dict = {param_name: param_value for param_name, param_value in finetuned_model.named_parameters()}
param_names_to_merge = get_param_names_to_merge(input_param_names=list(pretrained_param_dict.keys()), exclude_param_names_regex=[])

cos_sim_list = []
norm_list = []
cond_number_list = []
rank_list = []
rank_90_list = []
rank_95_list = []

with torch.no_grad():
for param_name in param_names_to_merge:
param_list = ['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj']
if all(key not in param_name for key in param_list):
continue
# import pdb
# pdb.set_trace()
# study the cosine similarity between finetuned_param_dict[param_name] and pretrained_param_dict[param_name]
task_vector_param_dict[param_name] = finetuned_param_dict[param_name] - pretrained_param_dict[param_name]
#check similarity
print(f"Investigating param_name: {param_name}")
cos_sim = cosine_similarity_matrix(finetuned_param_dict[param_name].float(), pretrained_param_dict[param_name].float())
cos_sim_list.append(cos_sim)
print(f"cosine similarity between the finetuned model and pretrained model: ",cos_sim)
# study the statistical properties of their difference
matrix_norm, cond_number, rank, rank_90, rank_95 = check_delta_properties(task_vector_param_dict[param_name].float())
norm_list.append(matrix_norm)
cond_number_list.append(cond_number)
rank_list.append(rank)
rank_90_list.append(rank_90)
rank_95_list.append(rank_95)
print(f"Properties of Delta Weight---matrix_norm: {matrix_norm}, cond_number: {cond_number}, rank: {rank}, rank_90: {rank_90}, rank_95: {rank_95}")

print(f"avg_cos_sim: {sum(cos_sim_list)/len(cos_sim_list)}")
print(f"avg_norm: {sum(norm_list)/len(norm_list)}")
print(f"avg_cond_number: {sum(cond_number_list)/len(cond_number_list)}")
print(f"avg_rank: {sum(rank_list)/len(rank_list)}")
print(f"avg_rank_90: {sum(rank_90_list)/len(rank_90_list)}")
print(f"avg_rank_95: {sum(rank_95_list)/len(rank_95_list)}")

print(f"Analysis end for the pretrained model: {base_model_path} and finetuned_model: {finetuned_model_path}")
del pretrained_model
del finetuned_model
return

moe_base = "/home/wanghanqing/projects/models/model_ver2/Mixtral-8x7B-v0.1"
instruct_base = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2"
base_model = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1"

code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf"
wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0"
llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf"
llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf"
llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf"
llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf"
wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0"
wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0"
meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf"


# Mistral-7B
## base
mistral_7b = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-v0.1"
## finetuned
mistral_7b_instruct_v1 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.1"
mistral_7b_instruct_v2 = "/home/wanghanqing/projects/models/model_ver2/Mistral-7B-Instruct-v0.2"

# llama2-7b
## base
llama2_7b = "/data/public/opensource_models/meta-llama/Llama-2-7b-hf"
## finetuned
llama2_7b_chat = "/data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf"
wizard_math_7b = "/data/public/opensource_models/WizardLM/WizardMath-7B-V1.0"
meta_math_7b = "/data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf"
magicoder_7b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-7b/ckpts/checkpoints/epoch_2_hf"

# llama2-13b
## base
llama2_13b = "/data/public/opensource_models/meta-llama/Llama-2-13b-hf"
## finetuned
llama2_13b_chat = "/data/public/opensource_models/meta-llama/Llama-2-13b-chat-hf"
wizard_math_13b = "/data/public/opensource_models/WizardLM/WizardMath-13B-V1.0"
magicoder_13b = "/data/public/wangshuo/exp/ft-en-magicoder-llama-2-13b/ckpts/checkpoints/epoch_2_hf"
code_llama13 = "/data/public/opensource_models/codellama/codellama-13b-python-hf"
wizard_coder = "/data/public/opensource_models/WizardLM/WizardCoder-Python-13B-V1.0"




import sys

# open a log file
log_file = open("analysis_log.txt", "w")

# save the original stdout
original_stdout = sys.stdout

# redirect stdout to the file
sys.stdout = log_file

# your code: from here on, all print() output is written to analysis_log.txt
print("This will be written to analysis_log.txt")





analysis_delta(base_model_path = llama2_7b, finetuned_model_path = llama2_7b_chat)
analysis_delta(base_model_path = llama2_7b, finetuned_model_path = wizard_math_7b)
analysis_delta(base_model_path = llama2_7b, finetuned_model_path = meta_math_7b)
analysis_delta(base_model_path = llama2_7b, finetuned_model_path = magicoder_7b)

# restore the original stdout
sys.stdout = original_stdout

# close the log file
log_file.close()
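
The same capture can be written more defensively with `contextlib.redirect_stdout`, which restores `sys.stdout` even if an `analysis_delta` call raises. A minimal equivalent sketch (same calls as above assumed):

```python
import contextlib

with open("analysis_log.txt", "w") as log_file, contextlib.redirect_stdout(log_file):
    analysis_delta(base_model_path=llama2_7b, finetuned_model_path=llama2_7b_chat)
    analysis_delta(base_model_path=llama2_7b, finetuned_model_path=wizard_math_7b)
    analysis_delta(base_model_path=llama2_7b, finetuned_model_path=meta_math_7b)
    analysis_delta(base_model_path=llama2_7b, finetuned_model_path=magicoder_7b)
```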
2 changes: 1 addition & 1 deletion run.sh
@@ -1,4 +1,4 @@
MODEL_SAVE_DIR=save/uncalibrated_model_0
MODEL_SAVE_DIR=save/uncalibrated_model_attn_1024_mlp_2048

mkdir -p $MODEL_SAVE_DIR

11 changes: 11 additions & 0 deletions run_tailor.sh
@@ -0,0 +1,11 @@
python \
tailor.py \
--finetuned_model_name /data/public/opensource_models/meta-llama/Llama-2-7b-chat-hf \
--save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/7b_chat \


# &

# python3 tailor.py \
# --finetuned_model_name /data/public/wangshuo/exp/ft-en-metameth-llama-2-7b/ckpts/checkpoints/epoch_2_hf \
# --save_dir /home/pingbowen/workspace/delta-compression/BitDelta/tailor_model/math_lora_7b \