IMDb Sentiment Classification with BERT

Problem

  1. In this challenge you will work with the IMDb movie review dataset, which contains 50,000 reviews: 25,000 for training and 25,000 for testing. Each review is labeled as either positive or negative.
  2. Your task is to fine-tune a BERT model for sentiment classification on this dataset. The specific requirements are:
    1. Dataset download: download the IMDb dataset and make sure it contains a train and a test folder, used for training and testing respectively (a loading sketch follows this list).
    2. Data preprocessing: apply the necessary preprocessing to the text, including tokenization, stop-word removal, and padding, so that it can be fed to the BERT model.
    3. Model training: fine-tune the BERT model on the training set, adjusting its parameters to reach the highest possible classification accuracy on the test set.
    4. Model evaluation: monitor the model's performance on the test set during training, record its test-set classification accuracy, and visualize the results (e.g. loss and prediction accuracy curves).
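
The code in the next section loads IMDb through torchtext's built-in IMDB loader, which downloads and caches the dataset automatically. If you download the raw aclImdb archive yourself instead, the train and test folders each contain pos and neg subfolders with one text file per review. Below is a minimal sketch for reading that layout into (label, text) pairs; the "aclImdb" path is an assumption, so point it at wherever you extracted the archive.

import os

def read_imdb_split(root, split):
    """Read one split ("train" or "test") of the raw aclImdb folder into (label, text) pairs."""
    samples = []
    for label in ("pos", "neg"):
        folder = os.path.join(root, split, label)
        for name in os.listdir(folder):
            with open(os.path.join(folder, name), encoding="utf-8") as f:
                samples.append((label, f.read()))
    return samples

# Hypothetical location of the extracted archive; adjust as needed
train_pairs = read_imdb_split("aclImdb", "train")
test_pairs = read_imdb_split("aclImdb", "test")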

Code

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import OneCycleLR # type: ignore
from transformers import BertTokenizer, BertForSequenceClassification
from torchtext.datasets import IMDB
import matplotlib.pyplot as plt

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def text_pipeline(text):
    """
    :param text: raw review text
    :return: encoding with token ids and attention mask
    """
    return tokenizer(text, padding="max_length", truncation=True, max_length=100, return_tensors="pt")


def label_pipeline(label):
    """
    :param label: review label; older torchtext releases yield "pos"/"neg",
                  newer ones yield the integers 1 (negative) / 2 (positive)
    :return: 1 for positive, 0 for negative
    """
    if isinstance(label, str):
        return 1 if label == "pos" else 0
    return 1 if label == 2 else 0


class IMDBDataset(Dataset):
    """Custom IMDB dataset that pre-tokenizes every review."""

    def __init__(self, data_iter):
        """
        :param data_iter: IMDB dataset iterator yielding (label, text) pairs
        """
        self.data = []
        for label, text in data_iter:
            encoding = text_pipeline(text)
            self.data.append((
                encoding["input_ids"].squeeze(0),
                encoding["attention_mask"].squeeze(0),
                label_pipeline(label)
            ))

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a single (input_ids, attention_mask, label) sample."""
        return self.data[idx]


# Load the IMDB dataset
train_iter, test_iter = IMDB(split="train"), IMDB(split="test")
train_data, test_data = IMDBDataset(train_iter), IMDBDataset(test_iter)


def collate_fn(batch):
    """
    Collate a batch of samples.
    :param batch: list of (input_ids, attention_mask, label) tuples
    :return: stacked input_ids, attention_masks, and a label tensor
    """
    input_ids, attention_masks, labels = zip(*batch)
    return torch.stack(input_ids), torch.stack(attention_masks), torch.tensor(labels, dtype=torch.long)


# Create data loaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Load the pretrained BERT model with a single-logit head (binary classification via BCE)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1).to(device)

# Freeze all but the last 4 encoder layers (embeddings and classifier head stay trainable)
for param in model.bert.encoder.layer[:-4].parameters():
    param.requires_grad = False

# Optimizer, loss function, and learning-rate scheduler
optimizer = optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-2)
criterion = nn.BCEWithLogitsLoss().to(device)

EPOCHS = 3
scheduler = OneCycleLR(
    optimizer,
    max_lr=5e-5,
    total_steps=EPOCHS * len(train_loader),
    pct_start=0.3,
    anneal_strategy='cos',
    div_factor=10,
    final_div_factor=100
)

# Train the model for one epoch
def train(model, loader, optimizer, criterion, scheduler):
    model.train()
    epoch_loss = 0
    for input_ids, attention_masks, labels in loader:
        input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits.squeeze(1)

        loss = criterion(logits, labels.float())
        loss.backward()
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()
    return epoch_loss / len(loader)

# Evaluate the model on the test set
def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for input_ids, attention_masks, labels in loader:
            input_ids, attention_masks, labels = input_ids.to(device), attention_masks.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits.squeeze(1)

            loss = criterion(logits, labels.float())
            epoch_loss += loss.item()

            preds = torch.sigmoid(logits) > 0.5
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return epoch_loss / len(loader), correct / total


# Training loop
train_losses, test_losses, test_accs = [], [], []
for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, scheduler)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    test_accs.append(test_acc)
    print(f'Epoch: {epoch + 1:02}, Train Loss: {train_loss:.3f}, Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2%}')

# Save the fine-tuned weights
torch.save(model.state_dict(), "bert_imdb.pth")

# Plot the training curves
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label="Train Loss")
plt.plot(test_losses, label="Test Loss")
plt.legend()
plt.title("Loss Curve")
plt.subplot(1, 2, 2)
plt.plot(test_accs, label="Test Accuracy")
plt.legend()
plt.title("Accuracy Curve")
plt.tight_layout()
plt.show()
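
As a quick sanity check after training, here is a minimal inference sketch that reloads the saved weights and scores a single review. It assumes the same tokenizer, max_length, and single-logit head used above; the example review string is hypothetical.

# Reload the fine-tuned weights and classify one review
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=1)
model.load_state_dict(torch.load("bert_imdb.pth", map_location=device))
model.to(device)
model.eval()

review = "A touching story with outstanding performances."  # hypothetical input
encoding = text_pipeline(review)
with torch.no_grad():
    logits = model(encoding["input_ids"].to(device),
                   attention_mask=encoding["attention_mask"].to(device)).logits.squeeze(1)
prob = torch.sigmoid(logits).item()
print("positive" if prob > 0.5 else "negative", f"(p={prob:.3f})")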

Results

(Figure: training/test loss curves and test accuracy curve produced by the plotting code above.)
