[机器学习] 作业 壹 2022Spring-hw1 COVID-19 Cases Prediction
zerc

Download Data

1
2
3
# Download the training and test CSVs from Google Drive.
# The URLs are quoted so the shell does not interpret '?' as a glob character.
!gdown "https://drive.google.com/uc?id=1kLSW_-cW2Huj7bh84YTdimGBOJaODiOS" --output covid.train.csv

!gdown "https://drive.google.com/uc?id=1iiI5qROrAhZn-o4FPqsE97bMzDEFvIdg" --output covid.test.csv

Data: 118 columns per row (1 id + 37 + 16 * 5; the label is included in this count)

Import Packages

1
2
3
4
5
6
7
8
9
10
11
12
13
14
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import math

import pandas as pd
import csv
import os

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

Same Seed

保证实验的可重复性

1
2
3
4
5
6
7
8
9
def same_seed(seed):
    """Seed NumPy and PyTorch and switch cuDNN to deterministic behavior.

    Args:
        seed: Integer seed applied to every random number generator below.
    """
    # Use deterministic convolution algorithms.
    torch.backends.cudnn.deterministic = True
    # Do not let cuDNN benchmark and pick the fastest convolution algorithm.
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed every visible GPU rather than only the current device.
        torch.cuda.manual_seed_all(seed)

Dataset

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def data_set_split(data_set, valid_ratio, seed):
    """Randomly split a data set into training and validation arrays.

    Args:
        data_set: Sequence of rows (e.g. a 2-D NumPy array).
        valid_ratio: Fraction of rows assigned to the validation split; the
            validation size is ``int(len(data_set) * valid_ratio)``.
        seed: Seed for the generator that drives the random permutation.

    Returns:
        A ``(train_rows, valid_rows)`` tuple of NumPy arrays.
    """
    valid_data_size = int(len(data_set) * valid_ratio)
    train_data_size = len(data_set) - valid_data_size
    generator = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(
        data_set, [train_data_size, valid_data_size], generator=generator
    )

    # Index the source rows with the subset indices instead of converting the
    # Subset objects element by element. This yields the same rows in the same
    # order and keeps an empty split two-dimensional.
    rows = np.asarray(data_set)
    train_idx = np.asarray(train_subset.indices, dtype=np.intp)
    valid_idx = np.asarray(valid_subset.indices, dtype=np.intp)
    return rows[train_idx], rows[valid_idx]

def select_feature(train_set, valid_set, test_set):
    """Separate feature columns from the label column.

    The last column of ``train_set`` and ``valid_set`` is taken as the label;
    all preceding columns are features. ``test_set`` is returned unchanged.

    Returns:
        ``(feature_train, feature_valid, feature_test, label_train,
        label_valid)``.
    """

    def _features_and_label(rows):
        # Everything but the last column, then the last column on its own.
        return rows[:, :-1], rows[:, -1]

    train_features, train_labels = _features_and_label(train_set)
    valid_features, valid_labels = _features_and_label(valid_set)
    return train_features, valid_features, test_set, train_labels, valid_labels

class COVID19Dataset(Dataset):
    """Dataset of float feature rows with optional float targets.

    When ``targets`` is ``None`` (inference), indexing yields only the feature
    tensor; otherwise it yields a ``(features, target)`` pair.
    """

    def __init__(self, features, targets=None):
        self.features = torch.FloatTensor(features)
        self.targets = None if targets is None else torch.FloatTensor(targets)

    def __getitem__(self, idx):
        sample = self.features[idx]
        if self.targets is None:
            return sample
        return sample, self.targets[idx]

    def __len__(self):
        return len(self.features)

Neural Network Model

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
class my_model(nn.Module):
    """Fully connected regressor: ``input_dim -> 64 -> ReLU -> 1``.

    Deeper variants (extra 64- and 32-unit hidden layers) were tried in the
    original notebook and are currently disabled.
    """

    def __init__(self, input_dim):
        super().__init__()
        hidden_layer = nn.Linear(input_dim, 64)
        output_layer = nn.Linear(64, 1)
        self.layers = nn.Sequential(hidden_layer, nn.ReLU(), output_layer)

    def forward(self, x):
        # Drop the trailing singleton dimension: (batch, 1) -> (batch,).
        return self.layers(x).squeeze(1)

Hyper Parameters

1
2
3
4
5
6
7
8
9
10
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hyper-parameters shared by the split, training and inference steps.
config = dict(
    seed=7,                          # seed for RNGs and the train/valid split
    valid_ratio=0.2,                 # fraction of training rows held out for validation
    n_epochs=5000,                   # upper bound on training epochs
    batch_size=256,
    lr=1e-5,                         # SGD learning rate
    early_stop=800,                  # non-improving epochs tolerated before stopping
    save_path='./models/model.ckpt',  # where the best checkpoint is written
)

Training Progress

1
2
# Module-level loss histories: filled during training, read when plotting.
loss_record_train, loss_record_valid = [], []
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def train(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD on MSE loss and checkpoint the best epoch.

    Per-batch training losses are appended to the module-level
    ``loss_record_train`` list and per-epoch mean validation losses to
    ``loss_record_valid``. Both means are also logged to TensorBoard.

    Args:
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        model: Module to optimize in place.
        config: Mapping providing ``'lr'``, ``'n_epochs'``, ``'early_stop'``
            and ``'save_path'``.
        device: Device that batches are moved to before the forward pass.
    """
    criterion = nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'], momentum=0.9)

    # Create the directory the checkpoint is actually written to, instead of
    # assuming it is always './models'.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs = config['n_epochs']
    best_loss = math.inf
    step = 0
    early_stop_count = 0

    writer = SummaryWriter()
    try:
        for epoch in range(n_epochs):
            model.train()
            loss_record = []
            for x, y in train_loader:
                optimizer.zero_grad()
                x, y = x.to(device), y.to(device)
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                step += 1
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)
                loss_record_train.append(batch_loss)
            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('loss/train', mean_train_loss, step)

            model.eval()
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    pred = model(x)
                    loss = criterion(pred, y)
                    loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            loss_record_valid.append(mean_valid_loss)
            writer.add_scalar('loss/valid', mean_valid_loss, step)
            print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])
                print(f'Saving model[{epoch+1}/{n_epochs}] with valid loss {best_loss:.3f}')
                early_stop_count = 0
            else:
                early_stop_count += 1

            # Stop once the number of consecutive non-improving epochs
            # exceeds the configured patience.
            if early_stop_count > config['early_stop']:
                print('Early stopping')
                break
    finally:
        # Flush and release the TensorBoard event file even if training fails.
        writer.close()

Prepare

1
2
3
4
5
6
7
8
9
10
11
12
13
# Seed every RNG before the split and loader shuffling so runs are repeatable.
same_seed(config['seed'])

# Load both CSVs as NumPy arrays without the 'id' column.
train_data = pd.read_csv('./covid.train.csv').drop(columns=['id']).values
test_set = pd.read_csv('./covid.test.csv').drop(columns=['id']).values

# Hold out a validation split, then separate features from the label column.
train_set, valid_set = data_set_split(train_data, config['valid_ratio'], config['seed'])
feature_train, feature_valid, feature_test, label_train, label_valid = select_feature(train_set, valid_set, test_set)

train_dataset = COVID19Dataset(feature_train, label_train)
valid_dataset = COVID19Dataset(feature_valid, label_valid)
test_dataset = COVID19Dataset(feature_test)

# Only the training loader is shuffled. A fixed validation order keeps the
# mean of per-batch losses comparable between epochs (the last batch may be
# smaller), and a fixed test order keeps predictions aligned with row ids.
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

Start

1
2
# Size the input layer to the number of feature columns, move the model to
# the selected device, then run the training loop.
model = my_model(input_dim=feature_train.shape[1])
model = model.to(device)
train(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    config=config,
    device=device,
)

Plot Learning Curve

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def plot_learning_curve():
    """Plot the recorded training and validation losses on one figure.

    Reads the module-level ``loss_record_train`` (one value per training
    step) and ``loss_record_valid`` (one value per epoch) lists.
    """
    train_steps = range(len(loss_record_train))
    # Spread the validation points evenly along the training-step axis.
    stride = len(loss_record_train) // len(loss_record_valid)
    valid_steps = train_steps[::stride]

    figure(figsize=(6, 4))
    plt.plot(train_steps, loss_record_train, c='tab:red', label='train')
    plt.plot(valid_steps, loss_record_valid, c='tab:cyan', label='mean valid')
    plt.ylim(0.0, 5.)
    plt.xlabel('Training steps')
    plt.ylabel('MSE loss')
    plt.title('Learning curve of COVID-19 model')
    plt.legend()
    plt.show()


plot_learning_curve()

image

Predict

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def predict(test_set, model, config):
    """Run ``model`` over every batch of ``test_set`` and collect the outputs.

    Args:
        test_set: Iterable of feature batches (for example a ``DataLoader``).
        model: Module used for inference; it is switched to eval mode.
        config: Unused; kept so existing callers keep working.

    Returns:
        NumPy array with the outputs of all batches concatenated along axis 0.
    """
    model.eval()
    # Move inputs to wherever the model's weights live rather than relying on
    # a global device; modules without parameters fall back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    preds = []
    with torch.no_grad():
        # Iterate the loader that was passed in, not a module-level global.
        for x in test_set:
            pred = model(x.to(target_device))
            preds.append(pred.detach().cpu())
    return torch.cat(preds, dim=0).numpy()

def save_pred(preds, file):
    """Write predictions to ``file`` as CSV with an ``id,tested_positive`` header.

    Args:
        preds: Iterable of predicted values; the row id is the position in
            this iterable, starting at 0.
        file: Path of the CSV file to create or overwrite.
    """
    # newline='' lets the csv module control line endings; without it extra
    # blank rows appear on platforms that translate '\n' on write.
    with open(file, 'w', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerow(['id', 'tested_positive'])
        writer.writerows(enumerate(preds))

# Rebuild the network and restore the best checkpoint written during training.
model = my_model(input_dim=feature_train.shape[1]).to(device)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model.load_state_dict(torch.load(config['save_path'], map_location=device))
preds = predict(test_loader, model, config)
save_pred(preds, './pred.csv')
Powered by Hexo & Theme Keep
Total words 7.1k Unique Visitor Page View