Python:PyTorch 保存和加载训练过的网络 (八十)

保存和加载模型

在这个 notebook 中,我将为你展示如何使用 PyTorch 来保存和加载模型。这个步骤十分重要,因为你一定希望能够加载预先训练好的模型来进行预测,或是根据新数据继续训练。

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision import datasets, transforms

import helper
# Define a transform to normalize the data.
# Fashion-MNIST images are single-channel (grayscale), so Normalize must get
# exactly one mean and one std. Passing 3-tuples fails on recent torchvision
# because a (3, 1, 1) mean cannot be subtracted in place from a (1, 28, 28) image.
# (x - 0.5) / 0.5 rescales ToTensor's output from [0, 1] to [-1, 1].
transform = transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])
# Download and load the training data
trainset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

# Download and load the test data
testset = datasets.FashionMNIST('F_MNIST_data/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)

在这里我们可以看见一张图片。

# Pull one batch from the training loader (the names are singular, but each holds a whole batch)
image, label = next(iter(trainloader))
# Display the first image of that batch; the trailing ';' keeps the notebook from echoing the return value
helper.imshow(image[0,:]);

file

file

构建网络

在这里,我将使用与第五部分中一样的模型。

class Network(nn.Module):
    ''' Fully-connected classifier with a configurable stack of hidden layers.

        Every hidden layer is followed by ReLU and dropout; the output layer is
        followed by log-softmax, so the model pairs with nn.NLLLoss.
    '''

    def __init__(self, input_size, output_size, hidden_layers, drop_p=0.5):
        ''' Builds a feedforward network with arbitrary hidden layers.

            Arguments
            ---------
            input_size: integer, size of the input layer
            output_size: integer, size of the output layer
            hidden_layers: sequence of integers, the sizes of the hidden layers;
                may be empty, in which case the input feeds the output layer directly
            drop_p: float, dropout probability applied after each hidden layer

        '''
        super().__init__()
        # Chain of layer widths: input -> hidden_1 -> ... -> hidden_n.
        # Keeping the input size at the front means the output layer can always
        # read its fan-in from sizes[-1], even when there are no hidden layers.
        sizes = [input_size, *hidden_layers]

        # One Linear per consecutive pair of widths (none if hidden_layers is empty)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(sizes[:-1], sizes[1:])])

        self.output = nn.Linear(sizes[-1], output_size)

        self.dropout = nn.Dropout(p=drop_p)

    def forward(self, x):
        ''' Forward pass through the network, returns the log-probabilities
            (log-softmax over dim 1) for each row of x
        '''
        for layer in self.hidden_layers:
            x = F.relu(layer(x))
            # Dropout is only active in train() mode; eval() turns it into a no-op
            x = self.dropout(x)
        x = self.output(x)

        return F.log_softmax(x, dim=1)

训练网络

接着,使用与之前一样的方法来训练网络。

# Create the network, define the criterion and optimizer
model = Network(784, 10, [500, 100])
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 2
steps = 0
running_loss = 0
print_every = 100
for e in range(epochs):
    for images, labels in trainloader:
        steps += 1
        # Flatten images into a 784 long vector. view() returns a reshaped
        # tensor instead of resizing the loader's batch in place.
        inputs = images.view(images.size(0), 784)

        optimizer.zero_grad()

        # Call the module rather than .forward() so registered hooks run.
        # Tensors track gradients directly; no Variable wrapper is needed.
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # .item() converts the 0-dim loss tensor to a Python float;
        # indexing it with [0] is an error from PyTorch 0.5 on.
        running_loss += loss.item()

        if steps % print_every == 0:
            # Model in inference mode, dropout is off
            model.eval()

            accuracy = 0
            test_loss = 0
            # no_grad() replaces the removed `volatile=True` flag: no autograd
            # history is recorded while evaluating.
            with torch.no_grad():
                for test_images, test_labels in testloader:
                    test_inputs = test_images.view(test_images.size(0), 784)

                    output = model(test_inputs)
                    test_loss += criterion(output, test_labels).item()

                    ## Calculating the accuracy
                    # Model's output is log-softmax, take exponential to get the probabilities
                    ps = torch.exp(output)
                    # Class with highest probability is our predicted class, compare with true label
                    equality = (test_labels == ps.max(dim=1)[1])
                    # Accuracy is number of correct predictions divided by all predictions, just take the mean
                    accuracy += equality.float().mean().item()

            print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(running_loss/print_every),
                  "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                  "Test Accuracy: {:.3f}".format(accuracy/len(testloader)))

            running_loss = 0

            # Make sure dropout is on for training
            model.train()
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:21: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:33: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:34: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:37: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number

Epoch: 1/2..  Training Loss: 1.114..  Test Loss: 0.655..  Test Accuracy: 0.752
Epoch: 1/2..  Training Loss: 0.749..  Test Loss: 0.594..  Test Accuracy: 0.781
Epoch: 1/2..  Training Loss: 0.654..  Test Loss: 0.567..  Test Accuracy: 0.784
Epoch: 1/2..  Training Loss: 0.621..  Test Loss: 0.498..  Test Accuracy: 0.811
Epoch: 1/2..  Training Loss: 0.600..  Test Loss: 0.518..  Test Accuracy: 0.807
Epoch: 1/2..  Training Loss: 0.551..  Test Loss: 0.494..  Test Accuracy: 0.816
Epoch: 1/2..  Training Loss: 0.565..  Test Loss: 0.476..  Test Accuracy: 0.824
Epoch: 1/2..  Training Loss: 0.561..  Test Loss: 0.479..  Test Accuracy: 0.821
Epoch: 1/2..  Training Loss: 0.522..  Test Loss: 0.476..  Test Accuracy: 0.827
Epoch: 2/2..  Training Loss: 0.539..  Test Loss: 0.461..  Test Accuracy: 0.831
Epoch: 2/2..  Training Loss: 0.523..  Test Loss: 0.450..  Test Accuracy: 0.832
Epoch: 2/2..  Training Loss: 0.511..  Test Loss: 0.454..  Test Accuracy: 0.833
Epoch: 2/2..  Training Loss: 0.511..  Test Loss: 0.451..  Test Accuracy: 0.831
Epoch: 2/2..  Training Loss: 0.508..  Test Loss: 0.447..  Test Accuracy: 0.834
Epoch: 2/2..  Training Loss: 0.492..  Test Loss: 0.448..  Test Accuracy: 0.838
Epoch: 2/2..  Training Loss: 0.486..  Test Loss: 0.440..  Test Accuracy: 0.833
Epoch: 2/2..  Training Loss: 0.505..  Test Loss: 0.427..  Test Accuracy: 0.845
Epoch: 2/2..  Training Loss: 0.488..  Test Loss: 0.441..  Test Accuracy: 0.837

保存和加载模型

可以想象,在每次使用神经网络时都重新进行训练很不现实。因此,我们可以保存之前训练好的网络,并在继续训练或是进行预测时加载网络。

PyTorch 网络的参数都存储在模型的 state_dict 中。我们可以看到这个状态字典包含了每个层的权重矩阵和偏置向量。

# Show the module structure, then the names of the entries stored in its state dict
print("Our model: \n\n", model, '\n')
print("The state dict keys: \n\n", model.state_dict().keys())
Our model: 

 Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=784, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=100, bias=True)
  )
  (output): Linear(in_features=100, out_features=10, bias=True)
  (dropout): Dropout(p=0.5)
) 

The state dict keys: 

 odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output.weight', 'output.bias'])
# Our network: 

## Network((hidden_layers): ModuleList((0): Linear(in_features=784, out_features=500)
##    (1): Linear(in_features=500, out_features=100))
##  (output): Linear(in_features=100, out_features=10)
## ) 

# The state dict keys: 
# odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output.weight', 'output.bias'])

最简单的做法是使用 torch.save 来保存状态字典。比如,我们可以将它保存到文件 'checkpoint.pth' 中。

# Persist only the state dict (the parameters), not the Network object itself
torch.save(model.state_dict(), 'checkpoint.pth')

接着,我们可以使用 torch.load 来加载这个状态字典。

# Read the saved state dict back from disk and list its entries
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source
state_dict = torch.load('checkpoint.pth')
print(state_dict.keys())
odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output.weight', 'output.bias'])
#odict_keys(['hidden_layers.0.weight', 'hidden_layers.0.bias', 'hidden_layers.1.weight', 'hidden_layers.1.bias', 'output.weight', 'output.bias'])

要将状态字典加载到神经网络中,你需要使用 model.load_state_dict(state_dict)。

# Copy the loaded parameters into the existing model (same architecture as the one that was saved)
model.load_state_dict(state_dict)

这看上去十分简单,但实际情况更加复杂。只有当模型结构与检查点的结构完全一致时,状态字典才能成功加载。如果我在创建模型时使用了不同的结构,便无法顺利加载。

# Try this
net = Network(784, 10, [400, 200, 100])
# This will throw an error because the tensor sizes are wrong!
net.load_state_dict(state_dict)
---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

<ipython-input-22-74e14cc8e983> in <module>()
      2 net = Network(784, 10, [400, 200, 100])
      3 # This will throw an error because the tensor sizes are wrong!
----> 4 net.load_state_dict(state_dict)

/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict)
    719         if len(error_msgs) > 0:
    720             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
--> 721                                self.__class__.__name__, "\n\t".join(error_msgs)))
    722 
    723     def parameters(self):

RuntimeError: Error(s) in loading state_dict for Network:
    Missing key(s) in state_dict: "hidden_layers.2.weight", "hidden_layers.2.bias". 
    While copying the parameter named "hidden_layers.0.weight", whose dimensions in the model are torch.Size([400, 784]) and whose dimensions in the checkpoint are torch.Size([500, 784]).
    While copying the parameter named "hidden_layers.0.bias", whose dimensions in the model are torch.Size([400]) and whose dimensions in the checkpoint are torch.Size([500]).
    While copying the parameter named "hidden_layers.1.weight", whose dimensions in the model are torch.Size([200, 400]) and whose dimensions in the checkpoint are torch.Size([100, 500]).
    While copying the parameter named "hidden_layers.1.bias", whose dimensions in the model are torch.Size([200]) and whose dimensions in the checkpoint are torch.Size([100]).
---------------------------------------------------------------------------

RuntimeError                              Traceback (most recent call last)

~/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict)
    481                 try:
--> 482                     own_state[name].copy_(param)
    483                 except Exception:

RuntimeError: inconsistent tensor size, expected tensor [400 x 784] and src [500 x 784] to have the same number of elements, but got 313600 and 392000 elements respectively at /Users/soumith/minicondabuild3/conda-bld/pytorch_1512381214802/work/torch/lib/TH/generic/THTensorCopy.c:86
During handling of the above exception, another exception occurred:

RuntimeError                              Traceback (most recent call last)

<ipython-input-18-74e14cc8e983> in <module>()
      2 net = Network(784, 10, [400, 200, 100])
      3 # This will throw an error because the tensor sizes are wrong!
----> 4 net.load_state_dict(state_dict)

~/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py in load_state_dict(self, state_dict, strict)
    485                                        'whose dimensions in the model are {} and '
    486                                        'whose dimensions in the checkpoint are {}.'
--> 487                                        .format(name, own_state[name].size(), param.size()))
    488             elif strict:
    489                 raise KeyError('unexpected key "{}" in state_dict'

RuntimeError: While copying the parameter named hidden_layers.0.weight, whose dimensions in the model are torch.Size([400, 784]) and whose dimensions in the checkpoint are torch.Size([500, 784]).

这意味着我们需要重建一个与训练时完全相同的模型。有关模型结构的信息需要与状态字典一起存储在检查点中。为了做到这一点,你需要构建一个字典,字典中包含重建模型的全部信息。

# Read the architecture off the trained model instead of hard-coding it, so the
# checkpoint cannot drift from the network it describes.
# The first Linear layer defines the input width; if the model has no hidden
# layers, that first layer is the output layer itself.
first_layer = model.hidden_layers[0] if len(model.hidden_layers) else model.output
checkpoint = {'input_size': first_layer.in_features,
              'output_size': model.output.out_features,
              'hidden_layers': [each.out_features for each in model.hidden_layers],
              # Dropout has no parameters, so it is absent from state_dict and
              # its probability must be stored separately.
              'drop_p': model.dropout.p,
              'state_dict': model.state_dict()}

torch.save(checkpoint, 'checkpoint.pth')

现在,检查点中包含了重建训练模型所需的全部信息。你可以随意将它编写为函数。相似地,我们也可以编写一个函数来加载检查点。

def load_checkpoint(filepath, map_location=None):
    ''' Rebuilds a Network from a checkpoint file and loads its trained weights.

        Arguments
        ---------
        filepath: path of a checkpoint written with torch.save; it must hold the
            keys 'input_size', 'output_size', 'hidden_layers' and 'state_dict',
            and may hold 'drop_p'
        map_location: forwarded to torch.load; pass 'cpu' to load a checkpoint
            that was saved from a GPU on a machine without one

        Returns
        -------
        A Network with the stored architecture and parameters. The model is
        returned in its default (training) mode.
    '''
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(filepath, map_location=map_location)
    model = Network(checkpoint['input_size'],
                    checkpoint['output_size'],
                    checkpoint['hidden_layers'],
                    # Checkpoints without 'drop_p' fall back to Network's own default
                    drop_p=checkpoint.get('drop_p', 0.5))
    model.load_state_dict(checkpoint['state_dict'])

    return model
# Rebuild the trained network from the checkpoint file and print its structure
model = load_checkpoint('checkpoint.pth')
print(model)
Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=784, out_features=500, bias=True)
    (1): Linear(in_features=500, out_features=100, bias=True)
  )
  (output): Linear(in_features=100, out_features=10, bias=True)
  (dropout): Dropout(p=0.5)
)
"""

Network(
  (hidden_layers): ModuleList(
    (0): Linear(in_features=784, out_features=500)
    (1): Linear(in_features=500, out_features=100)
  )
  (output): Linear(in_features=100, out_features=10)
)
"""

为者常成,行者常至