Skip to content

Commit 2ec4be7

Browse files
authored
Add snip_momentum structured pruning example with 80% sparsity ratio (deepspeedai#348)
1 parent 40e33a4 commit 2ec4be7

2 files changed

Lines changed: 264 additions & 0 deletions

File tree

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
#!/bin/bash
#
# Fine-tune and compress a BERT-base GLUE model with DeepSpeed compression,
# using snip_momentum sparse pruning (dense ratio 0.2, i.e. 80% sparsity).
#
# The script fills the placeholders of the JSON config template with the
# settings below, writes the resulting DeepSpeed config, and then launches
# run_glue_no_trainer.py on a single GPU.
#
# Run it from the directory that contains run_glue_no_trainer.py and config/.

# Abort on the first failing command or unset variable, and let a failure of
# the training process propagate through the `| tee` pipeline at the end.
set -euo pipefail

export CUDA_VISIBLE_DEVICES=0

TASK_NAME=mnli  # one of: mnli sst2 stsb qqp rte cola mrpc qnli
STAGE=one_stage
LRATE=5e-5
EPOCH=10
WARMUP_EPOCH=1
# NOTE: the template hard-codes "train_batch_size": 32, and this script runs
# with 1 process and gradient_accumulation_steps 1, so changing this value
# also requires updating train_batch_size in the template.
BATCH_SIZE_PER_GPU=32
NAME="pruning_sparse"
SAVE_PATH="./out/${NAME}/"
mkdir -p "${SAVE_PATH}"

### Layer reduction
LAYER_REDUCTION_ENABLE="false"
FP16_ENABLE="false"

### Weight quantization
WEIGHT_QUANT_ENABLE="false"
Q_GROUP=64
W_BIT1=4
W_BIT2=2

### Activation quantization
ACTIVATION_QUANT_ENABLE="false"
A_BIT1=8
A_BIT2=4

############# Pruning
### sparse_pruning (structured here: snip_momentum with a 4x1 block pattern)
SPARSE_PRUNING_ENABLE="true"  # <== the technique this example enables
# The inner quotes are kept because the value is pasted verbatim into the
# JSON config as a string literal.
SPARSE_PRUNING_BLOCK_PATTERN="\"4x1\""
SPARSE_PRUNING_OFFSET_STRIDE=1000
SPARSE_PRUNING_OFFSET=1000
SPARSE_PRUNING_OFFSET_END=51000
SPARSE_PRUNING_EXCLUDED_MODULES="[\"classifier\", \"pooler\"]"
S_DENSE_RATIO=0.2  # <== fraction of weights kept (80% sparsity)

### row_pruning (structured pruning)
ROW_PRUNING_ENABLE="false"
R_DENSE_RATIO=0.6

### head_pruning
HEAD_PRUNING_ENABLE="false"
H_DENSE_RATIO=0.6

template_json="config/ds_config_structural_pruning_TEMPLATE.json"
config_json="config/ds_config_structural_${NAME}.json"

# quantize_weight_in_forward is set to the opposite of the FP16 switch.
if [[ "${FP16_ENABLE}" == "true" ]]; then
  QuantW_FORWARD="false"
else
  QuantW_FORWARD="true"
fi

if [[ ! -f "${template_json}" ]]; then
  printf 'error: config template not found: %s\n' "${template_json}" >&2
  exit 1
fi

# Fill in the template placeholders. Expressions are applied in order, so
# SPARSE_PRUNING_OFFSET_STRIDE and SPARSE_PRUNING_OFFSET_END must be replaced
# before their prefix SPARSE_PRUNING_OFFSET.
sed \
  -e "s/LAYER_REDUCTION_ENABLE/${LAYER_REDUCTION_ENABLE}/" \
  -e "s/WEIGHT_QUANT_ENABLE/${WEIGHT_QUANT_ENABLE}/" \
  -e "s/Q_GROUP/${Q_GROUP}/" \
  -e "s/W_BIT1/${W_BIT1}/" \
  -e "s/W_BIT2/${W_BIT2}/" \
  -e "s/ACTIVATION_QUANT_ENABLE/${ACTIVATION_QUANT_ENABLE}/" \
  -e "s/A_BIT1/${A_BIT1}/" \
  -e "s/A_BIT2/${A_BIT2}/" \
  -e "s/SPARSE_PRUNING_ENABLE/${SPARSE_PRUNING_ENABLE}/" \
  -e "s/SPARSE_PRUNING_BLOCK_PATTERN/${SPARSE_PRUNING_BLOCK_PATTERN}/" \
  -e "s/SPARSE_PRUNING_OFFSET_STRIDE/${SPARSE_PRUNING_OFFSET_STRIDE}/" \
  -e "s/SPARSE_PRUNING_OFFSET_END/${SPARSE_PRUNING_OFFSET_END}/" \
  -e "s/SPARSE_PRUNING_OFFSET/${SPARSE_PRUNING_OFFSET}/" \
  -e "s/SPARSE_PRUNING_EXCLUDED_MODULES/${SPARSE_PRUNING_EXCLUDED_MODULES}/" \
  -e "s/S_DENSE_RATIO/${S_DENSE_RATIO}/" \
  -e "s/ROW_PRUNING_ENABLE/${ROW_PRUNING_ENABLE}/" \
  -e "s/R_DENSE_RATIO/${R_DENSE_RATIO}/" \
  -e "s/HEAD_PRUNING_ENABLE/${HEAD_PRUNING_ENABLE}/" \
  -e "s/H_DENSE_RATIO/${H_DENSE_RATIO}/" \
  -e "s/FP16_ENABLE/${FP16_ENABLE}/" \
  -e "s/QuantW_FORWARD/${QuantW_FORWARD}/" \
  -e "s/BATCH_SIZE_PER_GPU/${BATCH_SIZE_PER_GPU}/" \
  "${template_json}" > "${config_json}"

CONFIG="${config_json}"

# If the user provides *no* model, the command below first downloads the
# Hugging Face checkpoint and then compresses it. The same checkpoint is used
# for both student and teacher.
MODEL="yoshitomo-matsubara/bert-base-uncased-${TASK_NAME}"

# The command is kept in an array (not a string passed to eval) so that every
# argument stays a single word regardless of its content.
run_cmd=(
  python -m torch.distributed.launch --nproc_per_node=1
  --master_port 6618
  run_glue_no_trainer.py
  --seed 42
  --distill_method "${STAGE}"
  --model_name_or_path "${MODEL}"
  --task_name "${TASK_NAME}"
  --max_length 128
  --pad_to_max_length
  --per_device_train_batch_size "${BATCH_SIZE_PER_GPU}"
  --per_device_eval_batch_size 64
  --learning_rate "${LRATE}"
  --num_train_epochs "${EPOCH}"
  --num_warmup_epochs "${WARMUP_EPOCH}"
  --eval_step 1000
  --deepspeed_config "${CONFIG}"
  --deepspeed
  --save_best_model --clean_best_model
  --gradient_accumulation_steps 1
  --output_dir "${SAVE_PATH}"
)
log_file="${SAVE_PATH}train.log"

# Show the command being run, then run it while appending stdout to the log.
echo "${run_cmd[*]} | tee -a ${log_file}"
"${run_cmd[@]}" | tee -a "${log_file}"
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
{
2+
"train_batch_size": 32,
3+
"train_micro_batch_size_per_gpu": BATCH_SIZE_PER_GPU,
4+
"steps_per_print": 200,
5+
"zero_optimization": {
6+
"stage": 0
7+
},
8+
"fp16": {
9+
"enabled": FP16_ENABLE
10+
},
11+
"gradient_clipping": 1.0,
12+
"prescale_gradients": true,
13+
"wall_clock_breakdown": false,
14+
"compression_training": {
15+
"layer_reduction": {
16+
"enabled": LAYER_REDUCTION_ENABLE,
17+
"keep_number_layer": 5,
18+
"module_name_prefix": "bert.encoder.layer",
19+
"teacher_layer": [
20+
2,
21+
4,
22+
6,
23+
8,
24+
10
25+
],
26+
"other_module_name": [
27+
"bert.pooler",
28+
"bert.embeddings",
29+
"classifier"
30+
]
31+
},
32+
"weight_quantization": {
33+
"shared_parameters": {
34+
"enabled": WEIGHT_QUANT_ENABLE,
35+
"quantizer_kernel": false,
36+
"schedule_offset": 0,
37+
"quantize_groups": Q_GROUP,
38+
"quantize_verbose": false,
39+
"quantization_type": "symmetric",
40+
"quantize_weight_in_forward": QuantW_FORWARD,
41+
"rounding": "nearest",
42+
"fp16_mixed_quantize": {
43+
"enabled": false,
44+
"quantize_change_ratio": 0.1
45+
}
46+
},
47+
"different_groups": {
48+
"wq1": {
49+
"params": {
50+
"start_bits": W_BIT1,
51+
"target_bits": W_BIT1,
52+
"quantization_period": 0
53+
},
54+
"modules": [
55+
"attention.self",
56+
"word_embeddings"
57+
]
58+
},
59+
"wq2": {
60+
"params": {
61+
"start_bits": W_BIT2,
62+
"target_bits": W_BIT2,
63+
"quantization_period": 0
64+
},
65+
"modules": [
66+
"output.dense",
67+
"intermediate"
68+
]
69+
}
70+
}
71+
},
72+
"activation_quantization": {
73+
"shared_parameters": {
74+
"enabled": ACTIVATION_QUANT_ENABLE,
75+
"quantization_type": "symmetric",
76+
"range_calibration": "dynamic",
77+
"schedule_offset": 0
78+
},
79+
"different_groups": {
80+
"aq1": {
81+
"params": {
82+
"bits": A_BIT1
83+
},
84+
"modules": [
85+
"attention.self"
86+
]
87+
},
88+
"aq2": {
89+
"params": {
90+
"bits": A_BIT2
91+
},
92+
"modules": [
93+
"output.dense",
94+
"intermediate"
95+
]
96+
}
97+
}
98+
},
99+
"sparse_pruning": {
100+
"shared_parameters": {
101+
"enabled": SPARSE_PRUNING_ENABLE,
102+
"schedule_offset": SPARSE_PRUNING_OFFSET,
103+
"schedule_offset_end": SPARSE_PRUNING_OFFSET_END,
104+
"schedule_offset_stride": SPARSE_PRUNING_OFFSET_STRIDE,
105+
"method": "snip_momentum",
106+
"block_pattern": SPARSE_PRUNING_BLOCK_PATTERN,
107+
"dense_ratio": S_DENSE_RATIO,
108+
"excluded_modules": SPARSE_PRUNING_EXCLUDED_MODULES
109+
},
110+
"different_groups": {
111+
}
112+
},
113+
"row_pruning": {
114+
"shared_parameters": {
115+
"enabled": ROW_PRUNING_ENABLE,
116+
"schedule_offset": 2000,
117+
"method": "topk"
118+
},
119+
"different_groups": {
120+
"rp1": {
121+
"params": {
122+
"dense_ratio": R_DENSE_RATIO
123+
},
124+
"modules": [
125+
"intermediate.dense"
126+
],
127+
"related_modules": [
128+
[
129+
"layer.\\w+.output.dense"
130+
]
131+
]
132+
}
133+
}
134+
},
135+
"head_pruning": {
136+
"shared_parameters": {
137+
"enabled": HEAD_PRUNING_ENABLE,
138+
"schedule_offset": 2000,
139+
"method": "topk",
140+
"num_heads": 12
141+
},
142+
"different_groups": {
143+
"rp1": {
144+
"params": {
145+
"dense_ratio": H_DENSE_RATIO
146+
},
147+
"modules": [
148+
"attention.output.dense"
149+
],
150+
"related_modules": [
151+
[
152+
"self.query",
153+
"self.key",
154+
"self.value"
155+
]
156+
]
157+
}
158+
}
159+
}
160+
}
161+
}

0 commit comments

Comments
 (0)