pytorch学习

2024-04-29

字数统计: 2.7k | 阅读时长≈ 12 分钟

pytorch学习

学习参考：动手学深度学习

1 基础

损失函数（Loss）

定义方法:

L1Loss: $loss(y,y')=|y'-y|$
L2Loss: $loss(y,y')=\frac{1}{2}(y'-y)^2$
Huber's Robust Loss: $loss(y,y')=\begin{cases}
|y'-y|-\frac{1}{2}, & \mathrm{if}\ |y'-y|>1 \\
\frac{1}{2}(y'-y)^2, & \mathrm{otherwise}
\end{cases}$

网络层

展平层：（可以reshape输入输出，将其展开成一个向量）nn.Flatten()

激活函数

sigmoid: $\sigma (x)=\frac{1}{1+\exp(-x)}$
Tanh: $\tanh (x)=\frac{1-\exp(-2x)}{1+\exp(-2x)}$
ReLU: $\mathrm{ReLU}(x)=\max(x,0)$

2 框架

2.1 模型构造：Module&Sequential

Sequential可以用来连接各种Module及基于其写的类（可能是层，也可能是网络）。
对于一个自己写出的层，可以随意定义前向传播函数（即如何进行网络计算），其反向求导都可以进行。

import torch
from torch import nn
from torch.nn import functional as F

# Sample minibatch of shape (2, 20), drawn uniformly from [0, 1).
X=torch.rand(2,20)

class MLP(nn.Module):
    """Two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        # Layers assigned as attributes are registered as sub-modules.
        self.hidden = nn.Linear(in_features=20, out_features=256)
        self.out = nn.Linear(in_features=256, out_features=10)

    def forward(self, X):
        """Apply the hidden layer, a ReLU, then the output layer to ``X``."""
        activations = F.relu(self.hidden(X))
        return self.out(activations)

class MySequential(nn.Module):
    """Minimal re-implementation of ``nn.Sequential``: chains modules in order."""

    def __init__(self, *args):
        """Register every positional argument as a child module.

        Args:
            *args: ``nn.Module`` instances (variadic positional arguments,
                received as a tuple), applied in the order given.
        """
        super().__init__()
        # enumerate() yields (index, element) pairs for the argument tuple.
        for index, block in enumerate(args):
            # Child names must be strings: nn.Module builds parameter names
            # such as "0.weight" by concatenating them, so an int key breaks
            # state_dict() and named_parameters().
            self.add_module(str(index), block)

    def forward(self, X):
        """Feed ``X`` through each registered module in insertion order."""
        for module in self._modules.values():
            X = module(X)
        return X

class FixedHiddenMLP(nn.Module):
    """MLP whose forward pass uses a fixed random matrix, a reused layer and a loop.

    Demonstrates that ``forward`` may contain arbitrary Python code.
    """

    def __init__(self):
        super().__init__()
        # Constant random weights that are never trained. Registering them as
        # a non-persistent buffer (instead of a plain attribute) lets
        # .to()/.cuda()/.double() move or cast them together with the module,
        # while keeping them out of parameters() and state_dict().
        self.register_buffer(
            'rand_weight', torch.rand((20, 20), requires_grad=False),
            persistent=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, X):
        """Return a scalar tensor computed from ``X``.

        Args:
            X: Input whose last dimension is 20 (2-D, as required by
                ``torch.mm``).
        """
        X = self.linear(X)
        # Use the constant matrix together with mm and relu.
        X = F.relu(torch.mm(X, self.rand_weight) + 1)
        # Reuse the same fully connected layer: both applications share
        # one set of parameters.
        X = self.linear(X)
        # Plain Python control flow: halve X until its L1 norm is at most 1.
        while X.abs().sum() > 1:
            X /= 2
        return X.sum()

class NestMLP(nn.Module):
    """Block that nests an ``nn.Sequential`` stack in front of a final linear layer."""

    def __init__(self):
        super().__init__()
        # 20 -> 64 -> 32 stack with a ReLU after each linear layer.
        self.net = nn.Sequential(
            nn.Linear(20, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        )
        self.linear = nn.Linear(32, 16)

    def forward(self, X):
        """Run ``X`` through the nested stack, then project to 16 features."""
        # Only forward() fixes the order and manner of computation; the
        # attribute list in __init__ alone does not tell you that.
        hidden = self.net(X)
        return self.linear(hidden)

# nn.Sequential can nest any mix of blocks (layer + network, network + network,
# layer + layer, ...) as long as each one is based on nn.Module.
chimera = nn.Sequential(NestMLP(), nn.Linear(16, 20), FixedHiddenMLP())
print(chimera(X))

2.2 参数管理

2.2.1 参数访问


import torch
from torch import nn

net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

# Indexing the Sequential returns a layer; state_dict() then lists all of that
# layer's parameters (layers without parameters have nothing to show).
print(net[2].state_dict())

# Going further, an individual parameter of a layer can be read directly.
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)# the gradient is available as .bias.grad

# Many parameters can be visited at once; the * in print() unpacks the list
# so that each element is passed as a separate argument.
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])
# Module.named_parameters() returns an iterator over all parameters of the
# module, yielding (name, parameter) pairs where the parameter is a tensor.

net.state_dict()['2.bias'].data
# Module.state_dict() returns an ordered dict of all parameters of the network.
# Keys combine the sub-module name and the parameter name (here '2.bias');
# values are the parameter tensors.

#对于嵌套状态下的块，可以一层一层调用出来
def block1():
    """Build a small block: Linear(4, 8) -> ReLU -> Linear(8, 4) -> ReLU."""
    layers = [nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()]
    return nn.Sequential(*layers)

def block2():
    """Stack four ``block1()`` instances inside one ``nn.Sequential``."""
    stacked = nn.Sequential()
    child_names = [f'block {i}' for i in range(4)]
    for child_name in child_names:
        # Nesting happens here: each child is itself a Sequential.
        stacked.add_module(child_name, block1())
    return stacked

rgnet = nn.Sequential(block2(), nn.Linear(4, 1))

# Nested blocks are indexed level by level: rgnet[0] is the block2() container,
# [1] is its second block1() child, and [0] is that child's first Linear layer.
print(rgnet[0][1][0].bias.data)

2.2.2 参数初始化



def init_normal(m):
    """Initialize an ``nn.Linear`` with N(0, 0.01^2) weights and a zero bias.

    Meant to be passed to ``Module.apply``; modules of any other type are
    left untouched.

    Args:
        m: The module to initialize.
    """
    if type(m) == nn.Linear:
        # Both initializers come from torch.nn.init and work in place.
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # nn.Linear(..., bias=False) stores its bias as None; skip it then.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

def init_constant(m):
    """Set every weight of an ``nn.Linear`` to 1 and its bias to 0.

    Meant to be passed to ``Module.apply``; modules of any other type are
    left untouched.

    Args:
        m: The module to initialize.
    """
    if type(m) == nn.Linear:
        nn.init.constant_(m.weight, 1)
        # nn.Linear(..., bias=False) stores its bias as None; skip it then.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

def init_xavier(m):
    """Apply Xavier-uniform initialization to the weight of an ``nn.Linear``.

    Modules of any other type are ignored; the bias is not touched.
    """
    if type(m) != nn.Linear:
        return
    nn.init.xavier_uniform_(m.weight)
def init_42(m):
    """Fill the weight of an ``nn.Linear`` with the constant 42.

    Modules of any other type are ignored; the bias is not touched.
    """
    if type(m) != nn.Linear:
        return
    nn.init.constant_(m.weight, 42)

def my_init(m):
    """Custom initializer, showing that any user-defined scheme can be written.

    For an ``nn.Linear``: print the name and shape of its first parameter,
    draw the weights uniformly from [-10, 10], then zero every weight whose
    absolute value is below 5. Other module types are ignored.
    """
    if type(m) != nn.Linear:
        return
    first_name, first_param = list(m.named_parameters())[0]
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True keeps the weight, False multiplies it by zero.
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask

# Module.apply(fn) calls fn on every sub-module recursively, then on the
# module itself, so one call initializes all nested layers.
net.apply(init_normal)
# A single sub-module can also be initialized on its own.
net[0].apply(init_xavier)
# Parameters can also be overwritten directly through .data.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]

2.2.3 参数绑定

有时我们希望在多个层间共享参数。我们可以定义一个层，然后多次使用它。

# Give the shared layer a name so that its parameters can be referred to.
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                    shared, nn.ReLU(),
                    shared, nn.ReLU(),# the same module object is used twice, so both positions stay tied
                    nn.Linear(8, 1))
net(X)
# Check whether the parameters are equal.
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100
# Compare again after the write to confirm they are the very same object,
# not merely two tensors holding equal values.
print(net[2].weight.data[0] == net[4].weight.data[0])

2.3 自定义层

import torch
import torch.nn.functional as F
from torch import nn

class CenteredLayer(nn.Module):
    """Custom parameter-free layer that shifts its input to zero mean."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return ``X`` minus the mean taken over all of its elements."""
        overall_mean = X.mean()
        return X - overall_mean

class MyLinear(nn.Module):
    """Custom fully connected layer with its own parameters, followed by ReLU.

    As with any module, the computation itself is defined in ``forward``.
    """

    def __init__(self, in_units, units):
        """Create the trainable parameters.

        Args:
            in_units: Number of input features.
            units: Number of output features.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the parameters themselves rather than `.data`: `.data` detaches
        # them from autograd, so no gradient would ever reach weight or bias.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

2.4 读写

import torch
from torch import nn
from torch.nn import functional as F

class MLP(nn.Module):
    """Two-layer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.hidden = nn.Linear(in_features=20, out_features=256)
        self.output = nn.Linear(in_features=256, out_features=10)

    def forward(self, x):
        """Apply the hidden layer, a ReLU, then the output layer to ``x``."""
        activations = F.relu(self.hidden(x))
        return self.output(activations)

net = MLP()
X = torch.randn(size=(2, 20))
Y = net(X)

# torch.save / torch.load store and restore all parameters: the state_dict is
# written to the file 'mlp.params' and loaded into a fresh MLP instance.
torch.save(net.state_dict(), 'mlp.params')
clone = MLP()
clone.load_state_dict(torch.load('mlp.params'))
# Put the restored model into evaluation mode.
clone.eval()

2.5 GPU

import torch
from torch import nn

def try_gpu(i=0):  #@save
    """Return the device ``cuda:i`` if it exists, otherwise the CPU device."""
    available = torch.cuda.device_count()
    if available < i + 1:
        return torch.device('cpu')
    return torch.device(f'cuda:{i}')

def try_all_gpus():  #@save
    """Return all available CUDA devices, or ``[cpu]`` when there is none."""
    gpu_count = torch.cuda.device_count()
    gpus = []
    for i in range(gpu_count):
        gpus.append(torch.device(f'cuda:{i}'))
    if not gpus:
        return [torch.device('cpu')]
    return gpus

X = torch.ones(2, 3, device=try_gpu())# a device string such as 'cuda:0' can be passed directly instead
Z = X.cuda(1)# copies X to CUDA device 1; NOTE(review): needs at least two GPUs -- confirm on the target machine
net = nn.Sequential(nn.Linear(3, 1))
net = net.to(device=try_gpu())# a whole network is moved between devices with .to()

3 线性回归

对于标准深度学习模型，我们可以使用框架的预定义好的层。这使我们只需关注使用哪些层来构造模型，而不必关注层的实现细节。我们首先定义一个模型变量net，它是一个Sequential类的实例。 Sequential类将多个层串联在一起。当给定输入数据时，Sequential实例将数据传入到第一层，然后将第一层的输出作为第二层的输入，以此类推。在下面的例子中，我们的模型只包含一个层，因此实际上不需要Sequential。但是由于以后几乎所有的模型都是多层的，在这里使用Sequential会让你熟悉“标准的流水线”。

PyTorch中，全连接层在Linear类中定义。值得注意的是，我们将两个参数传递到nn.Linear中：第一个指定输入特征的维度（本例为2），第二个指定输出特征的维度；由于输出是单个标量，因此为1。

此处以回归方程$y=Xw+b$为例，其中$X=[x_1,x_2],w=[w_1,w_2]^T$

import numpy as np
import torch
from torch.utils import data
from d2l import torch as d2l

from torch import nn

# Ground-truth weight vector and bias used to generate the synthetic dataset.
true_w = torch.tensor([2, -3.4])
true_b = 4.2
# 1000 examples from d2l.synthetic_data; presumably y = Xw + b plus noise --
# TODO confirm against the d2l implementation.
features, labels = d2l.synthetic_data(true_w, true_b, 1000)

def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Build a PyTorch data iterator over the given tensors.

    Args:
        data_arrays: Sequence of tensors, e.g. ``(features, labels)``; it is
            unpacked into ``data.TensorDataset``.
        batch_size: Number of examples per minibatch.
        is_train: Forwarded as ``shuffle`` to the ``DataLoader``.

    Returns:
        A ``data.DataLoader`` over the wrapped dataset.
    """
    return data.DataLoader(
        data.TensorDataset(*data_arrays),
        batch_size=batch_size,
        shuffle=is_train,
    )

batch_size = 10
data_iter = load_array((features, labels), batch_size)

# Model: a single linear layer with 2 input features and 1 output.
net=nn.Sequential(nn.Linear(2,1))
# Initialize the weights from N(0, 0.01^2) and the bias to 0.
net[0].weight.data.normal_(0,0.01)
net[0].bias.data.fill_(0)

# Mean squared error loss.
loss=nn.MSELoss()

# Minibatch stochastic gradient descent over all parameters of net.
trainer=torch.optim.SGD(net.parameters(),lr=0.03)

num_epochs = 3
for epoch in range(num_epochs):
    for X, y in data_iter:
        l = loss(net(X) ,y)
        # Clear old gradients, backpropagate, then update the parameters.
        trainer.zero_grad()
        l.backward()
        trainer.step()
    # Report the loss over the full dataset after each epoch.
    l = loss(net(features), labels)
    print(f'epoch {epoch + 1}, loss {l:f}')

此外，还有softmax、多层感知机等内容，此处不再描述，细节在笔记中（因为发现和pytorch关系不大）

4 卷积网络

具体推导见笔记。

4.1 图像卷积

使用了一个简单的图像。

import torch
from torch import nn
from d2l import torch as d2l
import numpy as np

def corr2d(X,K):
    """Compute the 2-D cross-correlation of ``X`` with kernel ``K``.

    The kernel slides over ``X`` with stride 1 and no padding, so the output
    has shape ``(X.shape[0] - k_h + 1, X.shape[1] - k_w + 1)``.

    Args:
        X: 2-D input tensor.
        K: 2-D kernel tensor of shape ``(k_h, k_w)``.

    Returns:
        A tensor created with ``torch.zeros`` holding one window sum per
        output position.
    """
    kernel_rows, kernel_cols = K.shape
    out_rows = X.shape[0] - kernel_rows + 1
    out_cols = X.shape[1] - kernel_cols + 1
    Y = torch.zeros(out_rows, out_cols)
    for r in range(out_rows):
        for c in range(out_cols):
            # Element-wise product of the current window with the kernel.
            window = X[r:r + kernel_rows, c:c + kernel_cols]
            Y[r, c] = torch.sum(window * K)
    return Y

class Cov2d(nn.Module):
    """2-D cross-correlation layer with a learnable kernel and scalar bias."""

    def __init__(self,k_size):
        """Create the kernel and bias parameters.

        Args:
            k_size: Kernel shape ``(height, width)``, passed to ``torch.rand``.
        """
        super().__init__()
        # nn.Parameter (capital P) is the class that registers a trainable
        # tensor; lowercase nn.parameter is a module and is not callable.
        self.weight=nn.Parameter(torch.rand(k_size))
        self.bias=nn.Parameter(torch.zeros(1))

    def forward(self,X):
        """Return ``corr2d(X, weight) + bias``."""
        return corr2d(X,self.weight)+self.bias

# Test image: a 6x8 tensor of ones with columns 2-5 set to zero.
X=torch.ones(6,8)
X[:,2:6]=0
# A [1, -1] kernel responds only where horizontally adjacent values differ.
K = torch.tensor([[1.0, -1.0]])
Y=corr2d(X,K)
print(X,Y)

# The two leading 1s are the channel counts (in, out); kernel_size is the kernel shape.
net=nn.Conv2d(1,1,kernel_size=(1,2),bias=False)
# nn.Conv2d works on 4-D tensors, so add batch and channel dimensions.
X=X.reshape((1,1,6,8))
Y=Y.reshape((1,1,6,7))
lr=1e-2
episode=20

for i in range(episode):
    Y_=net(X)
    # Element-wise squared error between prediction and target.
    loss=(Y_-Y)**2
    net.zero_grad()
    loss.sum().backward()

    # Manual gradient-descent step on the kernel weights.
    net.weight.data[:]-=lr*net.weight.grad
    if (i + 1) % 2 == 0:
        print(f'epoch {i + 1}, loss {loss.sum():.3f}')

# The learned kernel.
print(net.weight.data)

4.1.1 填充和步幅

由于卷积会使得维度大幅下降，考虑到深度和边缘检测的问题，在边缘加上空白行列。
同样地，对于一些很大维度的图片，考虑增大步幅可以实现较快的维度下降。

下面的示例使用高度为3、宽度为5的卷积核，高度和宽度方向的填充分别为0和1，步幅分别为3和4：

def comp_conv2d(conv2d, X):
    """Apply ``conv2d`` to a 2-D tensor and return the 2-D result.

    Adds the batch and channel dimensions that ``nn.Conv2d`` expects and
    strips them again from the output.
    """
    X = X.reshape((1, 1) + X.shape)
    Y = conv2d(X)
    return Y.reshape(Y.shape[2:])

# comp_conv2d takes a plain 2-D input, so use a fresh 8x8 example here.
X = torch.rand(size=(8, 8))
# PyTorch spells the layer nn.Conv2d(in_channels, out_channels, ...) and the
# step-size keyword is `stride` (nn.Conv2D / strides= belong to another framework).
# padding = (rows added at top and bottom, columns added at left and right);
# stride = (row step, column step).
conv2d = nn.Conv2d(1, 1, kernel_size=(3, 5), padding=(0, 1), stride=(3, 4))
comp_conv2d(conv2d, X).shape

4.1.2 多输入多输出卷积（多通道卷积）

*此外还有核为$1\times 1$的卷积，不再赘述，具体看笔记。

4.1.3 池化层

由于卷积对位置很敏感，用池化层可以虚化对位置的敏感情况。

def pool2d(X,pool_size,mode='max'):
    """Slide a pooling window over a 2-D input with stride 1 and no padding.

    Args:
        X: 2-D array or tensor supporting 2-D slicing, ``.max()`` and
            ``.mean()``.
        pool_size: Window shape ``(height, width)``.
        mode: ``'max'`` for maximum pooling, ``'mean'`` for average pooling
            (``'avg'`` is accepted as an alias of ``'mean'``).

    Returns:
        A NumPy array of shape
        ``(X.shape[0] - p_h + 1, X.shape[1] - p_w + 1)``.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    # Reject unknown modes up front; otherwise neither branch would run and
    # the function would silently return an all-zero array.
    if mode not in ('max', 'mean', 'avg'):
        raise ValueError(
            f"unsupported pooling mode {mode!r}; expected 'max' or 'mean'")
    p_h, p_w = pool_size
    Y = np.zeros((X.shape[0] - p_h + 1, X.shape[1] - p_w + 1))
    use_max = mode == 'max'
    for i in range(Y.shape[0]):
        for j in range(Y.shape[1]):
            window = X[i:i + p_h, j:j + p_w]
            Y[i, j] = window.max() if use_max else window.mean()
    return Y

版权声明： 本博客所有文章除特别声明外，著作权归作者所有。转载请注明出处！