Regression with Different Optimizers

!pip install matplotlib==3.3.0
Collecting matplotlib==3.3.0
  Downloading matplotlib-3.3.0-1-cp37-cp37m-manylinux1_x86_64.whl (11.5 MB)
Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (7.1.2)
Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (1.3.2)
Requirement already satisfied: numpy>=1.15 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (1.19.5)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (3.0.7)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib==3.3.0) (0.11.0)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.1->matplotlib==3.3.0) (1.15.0)
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.2.2
    Uninstalling matplotlib-3.2.2:
      Successfully uninstalled matplotlib-3.2.2
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.
Successfully installed matplotlib-3.3.0
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
%matplotlib inline
%config InlineBackend.figure_format='retina'
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))
PyTorch version:[1.10.0+cu111].
device:[cuda:0].

Dataset

The dataset is created by generating random noise: x is sampled uniformly from [-3, 3), and y is exp(-x^2)*cos(10x) plus small Gaussian noise.

n_data = 10000
x_numpy = -3+6*np.random.rand(n_data,1)
y_numpy = np.exp(-(x_numpy**2))*np.cos(10*x_numpy) + 3e-2*np.random.randn(n_data,1)
plt.figure(figsize=(8,5))
plt.plot(x_numpy,y_numpy,'r.',ms=2)
plt.show()
x_torch = torch.Tensor(x_numpy).to(device)
y_torch = torch.Tensor(y_numpy).to(device)
print ("Done.")

(Figure: random_noise.jpg, scatter plot of the generated noisy dataset)

Define Model

Create the model.

Define the parameters.

Define xdim, hdim, and ydim:

xdim = 1
hdim = [16,16]
ydim = 1

(Figure: layer_ex1.jpg)

A list called layers is created and the linear layers are appended to it.
Tanh is used as the activation after each hidden linear layer, and a final linear layer (without an activation) is appended at the end.

Concatenate all layers

Here nn.Sequential is used so that the layers in the layer list are executed in order.

init params

This model does not use nn.Conv2d, so the conv2d-related parameters are never initialized, but that branch was kept as an example of parameter initialization for when other layer types are used.

forward

The forward pass that runs during training.

class Model(nn.Module):
    def __init__(self,name='mlp',xdim=1,hdims=[16,16],ydim=1):
        super(Model, self).__init__()
        self.name = name
        self.xdim = xdim
        self.hdims = hdims
        self.ydim = ydim

        self.layers = []
        prev_hdim = self.xdim
        for hdim in self.hdims:
            self.layers.append(nn.Linear(prev_hdim,hdim,bias = True))
            self.layers.append(nn.Tanh())  # activation
            prev_hdim = hdim
        # Final layer (without activation)
        self.layers.append(nn.Linear(prev_hdim,self.ydim,bias=True))

        # Concatenate all layers 
        self.net = nn.Sequential()
        for l_idx,layer in enumerate(self.layers):
            layer_name = "%s_%02d"%(type(layer).__name__.lower(),l_idx)
            self.net.add_module(layer_name,layer)

        self.init_param() # initialize parameters
    
    def init_param(self):
        for m in self.modules():
            if isinstance(m,nn.Conv2d): # init conv
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)
            elif isinstance(m,nn.Linear): # init dense
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)
    
    def forward(self,x):
        return self.net(x)

print ("Done.")        
Done.
LEARNING_RATE = 1e-2
# Instantiate models
model_sgd = Model(name='mlp_sgd',xdim=1,hdims=[64,64],ydim=1).to(device)
model_momentum = Model(name='mlp_momentum',xdim=1,hdims=[64,64],ydim=1).to(device)
model_adam = Model(name='mlp_adam',xdim=1,hdims=[64,64],ydim=1).to(device)
# Loss and optimizers
loss = nn.MSELoss()
optm_sgd = optim.SGD(model_sgd.parameters(),lr = LEARNING_RATE)
optm_momentum = optim.SGD(model_momentum.parameters(),lr = LEARNING_RATE,momentum = 0.9)
optm_adam = optim.Adam(model_adam.parameters(),lr = LEARNING_RATE)
print ("Done.")
Done.

Check Parameters

The input goes from 1 dimension to 64, then 64 -> 64, and finally 64 -> 1.

np.set_printoptions(precision=3)
n_param = 0
for p_idx,(param_name,param) in enumerate(model_sgd.named_parameters()):
    if param.requires_grad:
        param_numpy = param.detach().cpu().numpy() # to numpy array 
        n_param += len(param_numpy.reshape(-1))
        print ("[%d] name:[%s] shape:[%s]."%(p_idx,param_name,param_numpy.shape))
        print ("    val:%s"%(param_numpy.reshape(-1)[:5]))
print ("Total number of parameters:[%s]."%(format(n_param,',d')))
[0] name:[net.linear_00.weight] shape:[(64, 1)].
    val:[ 0.411 -1.113 -0.46  -0.317 -1.452]
[1] name:[net.linear_00.bias] shape:[(64,)].
    val:[0. 0. 0. 0. 0.]
[2] name:[net.linear_02.weight] shape:[(64, 64)].
    val:[-0.219  0.093  0.208 -0.118 -0.054]
[3] name:[net.linear_02.bias] shape:[(64,)].
    val:[0. 0. 0. 0. 0.]
[4] name:[net.linear_04.weight] shape:[(1, 64)].
    val:[ 0.123 -0.037 -0.073  0.021  0.185]
[5] name:[net.linear_04.bias] shape:[(1,)].
    val:[0.]
Total number of parameters:[4,353].

Train

Re-initialize the parameters of the three models.
Switch the three models to training mode.

Training simply loops over iterations: MAX_ITER steps are run (there is no notion of an epoch here).

At every iteration, BATCH_SIZE indices are drawn at random from the full dataset, and the corresponding samples are pulled from x_torch and y_torch.

The order of each update:

run the forward pass and compute the loss
zero the gradients
run backward
take an optimizer step

MAX_ITER,BATCH_SIZE,PLOT_EVERY = 1e4,64,500

model_sgd.init_param()
model_momentum.init_param()
model_adam.init_param()

model_sgd.train()
model_momentum.train()
model_adam.train()

for it in range(int(MAX_ITER)):
    r_idx = np.random.permutation(n_data)[:BATCH_SIZE]
    batch_x,batch_y = x_torch[r_idx],y_torch[r_idx]
    
    # Update with Adam
    y_pred_adam = model_adam.forward(batch_x)
    loss_adam = loss(y_pred_adam,batch_y)
    optm_adam.zero_grad()
    loss_adam.backward()
    optm_adam.step()

    # Update with Momentum
    y_pred_momentum = model_momentum.forward(batch_x)
    loss_momentum = loss(y_pred_momentum,batch_y)
    optm_momentum.zero_grad()
    loss_momentum.backward()
    optm_momentum.step()

    # Update with SGD
    y_pred_sgd = model_sgd.forward(batch_x)
    loss_sgd = loss(y_pred_sgd,batch_y)
    optm_sgd.zero_grad()
    loss_sgd.backward()
    optm_sgd.step()
    

    # Plot
    if ((it%PLOT_EVERY)==0) or (it==0) or (it==(MAX_ITER-1)):
        with torch.no_grad():
            y_sgd_numpy = model_sgd.forward(x_torch).cpu().detach().numpy()
            y_momentum_numpy = model_momentum.forward(x_torch).cpu().detach().numpy()
            y_adam_numpy = model_adam.forward(x_torch).cpu().detach().numpy()
            
            plt.figure(figsize=(8,4))
            plt.plot(x_numpy,y_numpy,'r.',ms=4,label='GT')
            plt.plot(x_numpy,y_sgd_numpy,'g.',ms=2,label='SGD')
            plt.plot(x_numpy,y_momentum_numpy,'b.',ms=2,label='Momentum')
            plt.plot(x_numpy,y_adam_numpy,'k.',ms=2,label='ADAM')
            plt.title("[%d/%d]"%(it,MAX_ITER),fontsize=15)
            plt.legend(labelcolor='linecolor',loc='upper right',fontsize=15)
            plt.show()

print ("Done.")

Conclusion

(Figures: predictions of SGD, Momentum, and ADAM plotted against the ground truth at selected iterations)

Discussion

Why do the differences arise?

Adam combines momentum with an adaptive learning rate, so it performs better. Then why is there a difference between SGD and SGD with momentum?

momentum

Momentum reuses previous gradients in the next update.
Because gradient information from earlier batches is folded into each update, every step effectively sees more of the data at once; a rough sketch of the update rule follows.
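As a minimal sketch (illustrative only, not PyTorch's exact internals), the momentum update used by optim.SGD(momentum=0.9) behaves roughly like the made-up helper below:

import numpy as np

def momentum_step(w, grad, velocity, lr=1e-2, beta=0.9):
    # velocity is an exponentially decaying accumulation of past gradients,
    # so this step also reflects gradients seen in earlier mini-batches
    velocity = beta*velocity + grad
    w = w - lr*velocity
    return w, velocity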

adam

Adam combines momentum with an adaptive learning rate, so even with the same base learning rate it learns much faster; a sketch of the update is shown below.
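A minimal sketch of the Adam update (illustrative, with made-up names, not the optim.Adam implementation): the first moment plays the role of momentum, and the second moment gives each parameter its own effective step size.

import numpy as np

def adam_step(w, grad, m, v, t, lr=1e-2, beta1=0.9, beta2=0.999, eps=1e-8):
    m = beta1*m + (1.0-beta1)*grad        # first moment: momentum-like average of gradients
    v = beta2*v + (1.0-beta2)*grad**2     # second moment: average of squared gradients
    m_hat = m/(1.0-beta1**t)              # bias correction (t starts at 1)
    v_hat = v/(1.0-beta2**t)
    w = w - lr*m_hat/(np.sqrt(v_hat)+eps) # per-parameter effective step size
    return w, m, v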

sgd

With plain SGD alone, it can take until the model has gone over the entire dataset many times for the fit to converge.
The reason only the largest peak is captured is that with a squared loss the model corrects large errors a lot and small errors only a little.
This shows that the squared loss does not always give good results; a small illustration of this weighting follows.
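A quick toy illustration of this weighting (numbers are made up, not from the experiment): the gradient of the squared loss grows with the residual, while an L1 loss would weight every residual equally.

import numpy as np

residuals = np.array([0.05, 0.5, 1.0])  # small, medium, large errors
mse_grads = 2.0*residuals               # d/dr r^2 = 2r : large errors dominate the update
l1_grads = np.sign(residuals)           # d/dr |r| = sign(r) : all errors weighted equally
print(mse_grads)  # [0.1 1.  2. ]
print(l1_grads)   # [1. 1. 1.]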

Why is momentum helpful for minibatch training?
If the gradients from previous batches were not carried over, a single training step could only model the very small part of the data contained in that batch.