# eval_ceval.py -- forked from jingyaogong/minimind
import random
import time
import os
import pandas as pd
import torch
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
from model.model import Transformer
from model.LMConfig import LMConfig
import torch.nn.functional as F
warnings.filterwarnings('ignore')
def init_model(lm_config):
    """Create the tokenizer and the model used for evaluation.

    Args:
        lm_config: Model configuration. ``use_moe`` and ``dim`` are used to
            build the checkpoint path, and the object itself is passed to
            ``Transformer``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been moved to ``device``.

    Note:
        This function reads the module-level name ``device``, which this file
        assigns in its ``__main__`` section before calling ``init_model``.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        './model/minimind_tokenizer',
        trust_remote_code=True,
        use_fast=False,
    )

    # 1: load weights from a .pth checkpoint; 2: load through transformers.
    model_from = 1

    if model_from != 1:
        model = AutoModelForCausalLM.from_pretrained('minimind', trust_remote_code=True)
    else:
        moe_suffix = '_moe' if lm_config.use_moe else ''
        ckp = f'./out/single_chat/full_sft_{lm_config.dim}{moe_suffix}.pth'

        model = Transformer(lm_config)
        weights = torch.load(ckp, map_location=device)

        # Strip the unwanted prefix from parameter names. The affected keys
        # are snapshotted first so the dict is not mutated while iterating it.
        unwanted_prefix = '_orig_mod.'
        prefixed_keys = [key for key in weights if key.startswith(unwanted_prefix)]
        for key in prefixed_keys:
            weights[key[len(unwanted_prefix):]] = weights.pop(key)

        # Load into the model; strict=False tolerates key mismatches.
        model.load_state_dict(weights, strict=False)

    model = model.to(device)
    return model, tokenizer
if __name__ == "__main__":
    # Script entry point: run the model over every CSV in the validation
    # directory, pick the option (A/B/C/D) with the highest probability for
    # each question, write one result CSV per input file, and write an overall
    # summary to "test.log" in the results directory.

    # -------------------------------------------------------------------------
    # `device` must remain a module-level name: init_model() reads it as a
    # global rather than taking it as a parameter.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    lm_config = LMConfig()
    # -------------------------------------------------------------------------

    model, tokenizer = init_model(lm_config)
    model = model.eval()

    # Message template; adjust to whatever chat template the tokenizer uses.
    messages_origin = [{"role": "system", "content": "开始回答问题"}]

    # Input and output directories.
    File_Dir = "ceval/ceval-exam/val"
    results_dir = "ceval/ceval_result"
    os.makedirs(results_dir, exist_ok=True)

    result_columns = ['question', 'A', 'B', 'C', 'D', 'answer', 'llm_answer', 'is_right']

    # Token ids of the four option letters. They do not depend on the
    # question, so they are looked up once instead of once per row.
    # NOTE(review): the indexing below assumes each letter encodes to a single
    # token id (otherwise `.item()` raises) -- confirm for the tokenizer in use.
    option_ids = {
        option: tokenizer(option).data['input_ids'] for option in ('A', 'B', 'C', 'D')
    }

    # Running totals across all files.
    total_correct = 0
    total_questions = 0
    # One preformatted log line per evaluated file, in processing order.
    per_file_log_lines = []

    # Sorted so the processing order (and therefore the log) is deterministic.
    for filename in sorted(os.listdir(File_Dir)):
        if not filename.endswith('.csv'):
            continue

        test_df = pd.read_csv(os.path.join(File_Dir, filename))
        file_stem = filename.split(".")[0]

        # Rows are collected in a plain list and turned into a DataFrame once:
        # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
        result_rows = []
        total_correct_in_file = 0

        for row in test_df.itertuples():
            question = row.question
            right_answer = row.answer

            prompt = f'{question}。选择 A: {row.A}, B: {row.B}, C: {row.C}, D: {row.D}'
            messages = messages_origin.copy()
            messages.append({"role": "user", "content": prompt})

            new_prompt = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            x = tokenizer(new_prompt).data['input_ids']
            x = torch.tensor(x, dtype=torch.long, device=device)[None, ...]

            # Pure inference: no autograd graph is needed.
            with torch.no_grad():
                # Presumably eval_answer returns logits indexable as
                # [batch, vocab] -- TODO confirm against the model code.
                res_ids = model.eval_answer(x)
                probabilities = F.softmax(res_ids, dim=-1)

            # Probability assigned to each option letter.
            options_prob = {
                option: probabilities[0, ids].item()
                for option, ids in option_ids.items()
            }
            # Option with the highest probability (ties go to the earliest letter).
            max_option_answer = max(options_prob, key=options_prob.get)

            is_right = 1 if max_option_answer == right_answer else 0
            total_correct_in_file += is_right

            result_rows.append({
                'question': question,
                'A': row.A,
                'B': row.B,
                'C': row.C,
                'D': row.D,
                'answer': right_answer,
                'llm_answer': max_option_answer,
                'is_right': is_right
            })

        question_count = len(test_df)
        total_correct += total_correct_in_file
        total_questions += question_count

        # Guard against an empty CSV, which would otherwise divide by zero.
        accuracy = total_correct_in_file / question_count if question_count else 0.0
        file_summary = f'文件 {filename} 的正确率: {accuracy:.2%}'

        # The last row of each result file carries that file's accuracy.
        result_rows.append({
            'question': '-',
            'A': '-',
            'B': '-',
            'C': '-',
            'D': '-',
            'answer': file_summary,
            'llm_answer': '-',
            'is_right': '-'
        })
        print(f'{file_stem} ,{total_correct_in_file}/{question_count},正确率: {accuracy:.2%}')

        # Save this file's results to CSV.
        results_df = pd.DataFrame(result_rows, columns=result_columns)
        results_path = os.path.join(results_dir, f"{file_stem}_result.csv")
        results_df.to_csv(results_path, index=False)

        per_file_log_lines.append(f"{filename}: {file_summary}\n")

    # Overall accuracy across every evaluated file.
    total_accuracy = total_correct / total_questions if total_questions > 0 else 0

    # Write the overall totals followed by one line per file.
    log_path = os.path.join(results_dir, "test.log")
    result = f"总题数: {total_questions}\n总正确数: {total_correct}\n总正确率: {total_accuracy:.2%}"
    # Explicit UTF-8: the log contains Chinese text and the platform default
    # encoding is not guaranteed to be able to represent it.
    with open(log_path, 'w', encoding='utf-8') as log_file:
        # Trailing newline keeps the first per-file line off the summary line.
        log_file.write(result + "\n")
        log_file.writelines(per_file_log_lines)
    print(result)