优化器与损失函数

epoch和batch_size的选择

epoch的作用在于逐步提高模型的精度，直到达到一个相对稳定的水平。
batch size过小会导致模型训练过程不稳定，容易受到噪声数据干扰；而batch size过大则会导致模型训练时间过长。但batch size应该在设备允许的条件下尽可能地大，因为一个batch中的样本数量越多，其样本分布就越接近真实的数据分布。

优化器optim

Adam和SGD通常是首选。Adam由于其自适应学习率特征，通常可以快速收敛，特别是在训练初期

SGD可能需要更细致的调优，但其优势在于训练后期可能会达到更好的性能。

alt text
gif路径

待解决问题

在解决cv问题时如何选择合适的优化器

小规模数据集和简单网络

Adagrad、RMSprop也可能表现良好，这些优化器对每个参数的学习率进行自适应调整，有可能有助于更快的收敛
模型泛化

如果你关注模型的泛化能力，可能会考虑使用AdamW，它结合了Adam的自适应学习率和权重衰减，有助于防止过拟合
资源和效率

如果你的计算资源有限，或者需要快速迭代，可能会选择计算成本较低的优化器，如SGD和Adagrad
研究和比赛(不用在实践中)

在研究和比赛中，最佳实践是尝试多种优化器，并使用交叉验证来找到最佳配置。有时候，结合不同优化器的优点，如使用SGD进行预热，然后切换到Adam，也能带来提升

参考文献：随机梯度下降

三类梯度下降算法概述

GD(Gradient Descent)：就是不划分Batch，基于整个数据集计算梯度，梯度准确，但数据量大时，计算非常耗时；同时神经网络常是非凸的，网络最终可能收敛到初始点附近的局部最优点。
SGD(Stochastic Gradient Descent)：就是Batch Size = 1，每次计算一个样本，梯度不准确，所以学习率要降低。
mini-batch SGD：就是选择合适的Batch Size的算法，mini-batch利用带噪声的梯度，一定程度上缓解了GD算法直接掉进初始点附近局部最优解的问题。同时相比SGD梯度更准确了，学习率可以相应加大。

凸：
- 指的是顺着梯度方向走到底就 一定是最优解 。
- 大部分传统机器学习问题 都是凸的。
非凸：
- 指的是顺着梯度方向走到底只能保证是局部最优，不能保证 是全局最优。
- 深度学习以及小部分传统机器学习问题都是非凸的。

最优化问题在机器学习中有非常重要的地位，很多机器学习算法都归结为求解最优化问题。在各种最优化算法中，梯度下降法是最简单、最常见的一种。

alt text

学习率衰减策略

固定步长衰减
多步长衰减
指数衰减
余弦退火
（拿代码去做实验）

损失函数

网络训练时常用的Loss为交叉熵（衡量两个概率分布之间的距离：概率分布 – 特征分布）

与mse的区别 sigmoid softmax

nn.CrossEntropyLoss()：多分类交叉熵，等效于 nn.LogSoftmax() + nn.NLLLoss()

nn.NLLLoss()：多分类交叉熵（负对数似然），输入需为对数概率，例如 nn.LogSoftmax() 的输出

nn.BCELoss()：二分类交叉熵 ~ nn.Sigmoid()，输入在 0~1

log：简化计算（乘法变加法），提供数值稳定性，保证结果为正

nn.BCEWithLogitsLoss()：this loss combines a ‘Sigmoid’ layer and the ‘BCELoss’ in one single class

centerloss

减小类内距

#中心点暂时无法确定，我们将中心点当成网络参数，让网络自己学习

import torch
from torch import nn
import torch.nn.functional as F
class CenterLoss(nn.Module):
    """Center loss: pulls every feature towards a learnable centre of its class.

    The class centres are not known up front, so they are registered as a
    network parameter and learned together with the rest of the model.

    Args:
        cls_num: Number of classes (one centre per class).
        feature_num: Length of each feature vector.
    """

    def __init__(self, cls_num, feature_num):
        super().__init__()
        self.cls_num = cls_num
        # One randomly initialised centre per class, shape (cls_num, feature_num).
        self.center = nn.Parameter(torch.randn(cls_num, feature_num))

    def forward(self, xs, ys):
        """Return the sum of per-sample centre distances, each divided by its class count.

        Args:
            xs: Features, one row per sample, with ``feature_num`` columns.
            ys: Class index of every row in ``xs``; integer or floating dtype.

        Returns:
            Scalar tensor: for each sample the Euclidean distance to its class
            centre divided by the number of samples of that class in the
            batch, summed over the batch.
        """
        labels = ys.long()
        # Centre that belongs to each sample's class.
        center_exp = self.center.index_select(dim=0, index=labels)
        # torch.histc is not implemented for integer tensors on every backend,
        # and class labels normally arrive as integers (nn.NLLLoss needs them
        # that way), so count on a floating-point view of the labels.
        label_values = ys if ys.is_floating_point() else ys.float()
        count = torch.histc(label_values, bins=self.cls_num, min=0, max=self.cls_num - 1)
        # Per-sample count of how many batch samples share its class.
        count_exp = count.index_select(dim=0, index=labels)
        distance = torch.sqrt(torch.sum(torch.pow(xs - center_exp, 2), dim=1))
        center_loss = torch.sum(torch.div(distance, count_exp))
        return center_loss
class ArcSoftmax(nn.Module):
    """Angular-margin softmax head.

    Args:
        feature_dim: Length of each input feature vector.
        cls_dim: Number of classes (columns of the weight matrix).
    """

    def __init__(self, feature_dim, cls_dim=10):
        super().__init__()
        self.W = nn.Parameter(torch.randn(feature_dim, cls_dim))

    def forward(self, feature, m=0.5, s=10):
        """Return the log of the margin-penalised softmax-style ratio per class.

        Args:
            feature: Input features, one row per sample.
            m: Margin added to the angle.
            s: Scale; also divides the cosine before ``acos`` is taken.
        """
        unit_feature = F.normalize(feature, dim=1)
        unit_weight = F.normalize(self.W, dim=0)
        # Cosine between every sample and every class column, shrunk by s.
        scaled_cos = torch.matmul(unit_feature, unit_weight) / s
        angle = torch.acos(scaled_cos)

        exp_with_margin = torch.exp(s * torch.cos(angle + m))
        exp_plain = torch.exp(s * torch.cos(angle))
        # Sum over all classes except the one in the current column.
        exp_others = torch.sum(exp_plain, dim=1, keepdim=True) - exp_plain

        return torch.log(exp_with_margin / (exp_with_margin + exp_others))
class MainNet(nn.Module):
    """MLP that maps 784-wide (28*28*1) inputs to 2-D features, scored by ArcSoftmax.

    Training combines the NLL loss on the ArcSoftmax output with a center loss
    on the 2-D features.
    """

    def __init__(self):
        super().__init__()
        # Linear (no bias) -> BatchNorm -> Mish for each consecutive width pair,
        # followed by a plain Linear projection down to 2 features.
        widths = (28 * 28 * 1, 512, 256, 128)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out, bias=False))
            stack.append(nn.BatchNorm1d(fan_out))
            stack.append(nn.Mish())
        stack.append(nn.Linear(widths[-1], 2))
        self.hidden_layer = nn.Sequential(*stack)

        self.output_layer = ArcSoftmax(2, 10)
        self.center_loss_layer = CenterLoss(10, 2)
        self.nllloss = nn.NLLLoss()

    def forward(self, xs):
        """Return ``(features, outputs)``: the 2-D features and the ArcSoftmax output."""
        features = self.hidden_layer(xs)
        return features, self.output_layer(features)

    def getLoss(self, outputs, features, labels):
        """Return the NLL loss on ``outputs`` plus the center loss on ``features``."""
        classification_term = self.nllloss(outputs, labels)
        center_term = self.center_loss_layer(features, labels)
        return classification_term + center_term
    
def train(net, epochs=500):
    """Train ``net`` with SGD and plot its feature space every 10th epoch.

    Relies on ``train_loader``, ``DEVICE`` and ``visualize`` being defined at
    module level outside this function.

    Args:
        net: Model whose call returns ``(features, class_outputs)`` and which
            provides ``getLoss(outputs, features, labels)``.
        epochs: Number of passes over ``train_loader``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    # Adam used to be constructed here and was immediately replaced by SGD, so
    # only SGD ever took effect. The learning rate is spelled out because older
    # PyTorch releases have no default for it; 1e-3 is the default used by
    # recent releases.
    opt = torch.optim.SGD(net.parameters(), lr=1e-3)
    for epoch in range(epochs):
        feature_loader = []
        labels_loader = []
        loss = None
        for img, label in train_loader:
            img = img.reshape(-1, 28 * 28).to(DEVICE)
            label_ = label.to(DEVICE)

            feature, cls_out = net(img)
            loss = net.getLoss(cls_out, feature, label_)

            opt.zero_grad()
            loss.backward()
            opt.step()

            # Only the values are needed for plotting; storing the
            # graph-attached tensor would keep every batch's autograd graph
            # alive until the end of the epoch.
            feature_loader.append(feature.detach())
            labels_loader.append(label)

        if loss is None:
            raise ValueError("train_loader produced no batches")

        print(f"epoch:{epoch},loss:{loss.item()}")
        if epoch % 10 == 0:
            features = torch.cat(feature_loader, 0)
            labels = torch.cat(labels_loader)
            visualize(features.cpu().numpy(), labels.cpu().numpy(), epoch=epoch + 100)
# Build the network on DEVICE and start training.
# NOTE(review): DEVICE is only assigned further down in these notes; it must be
# defined before this line runs.
net = MainNet().to(DEVICE)
train(net)

arcsoftmax

增大类间距

import torch
import torch.nn as nn
import torch.nn.functional as F
class ArcSoftmax(nn.Module):
    """Angular-margin softmax head.

    Args:
        feature_dim: Length of each input feature vector.
        cls_dim: Number of classes (columns of the weight matrix).
    """

    def __init__(self, feature_dim, cls_dim=10):
        super().__init__()
        self.W = nn.Parameter(torch.randn(feature_dim, cls_dim))

    def forward(self, feature, m=0.5, s=10):
        """Return the log of the margin-penalised softmax-style ratio per class.

        Args:
            feature: Input features, one row per sample.
            m: Margin added to the angle.
            s: Scale; also divides the cosine before ``acos`` is taken.
        """
        unit_feature = F.normalize(feature, dim=1)
        unit_weight = F.normalize(self.W, dim=0)
        # Cosine between every sample and every class column, shrunk by s.
        scaled_cos = torch.matmul(unit_feature, unit_weight) / s
        angle = torch.acos(scaled_cos)

        exp_with_margin = torch.exp(s * torch.cos(angle + m))
        exp_plain = torch.exp(s * torch.cos(angle))
        # Sum over all classes except the one in the current column.
        exp_others = torch.sum(exp_plain, dim=1, keepdim=True) - exp_plain

        return torch.log(exp_with_margin / (exp_with_margin + exp_others))
class MainNet(nn.Module):
    """CNN that maps 1-channel images to 2-D features, scored by ArcSoftmax.

    Three conv stages (each halving the spatial size via max-pooling) feed a
    fully connected stack that expects 128 * 3 * 3 flattened values.
    """

    def __init__(self):
        super().__init__()

        def conv_stage(in_channels, out_channels):
            # 3x3 conv (no bias) -> BatchNorm -> Mish -> 2x2 max-pool.
            return [
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.Mish(),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ]

        def dense_stage(in_features, out_features):
            # Linear (no bias) -> BatchNorm -> Mish.
            return [
                nn.Linear(in_features, out_features, bias=False),
                nn.BatchNorm1d(out_features),
                nn.Mish(),
            ]

        channels = (1, 32, 64, 128)
        conv_modules = []
        for in_channels, out_channels in zip(channels[:-1], channels[1:]):
            conv_modules.extend(conv_stage(in_channels, out_channels))
        self.conv_layers = nn.Sequential(*conv_modules)

        widths = (128 * 3 * 3, 256, 128, 64, 32)
        dense_modules = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            dense_modules.extend(dense_stage(in_features, out_features))
        # Final projection to the 2-D feature space keeps its bias.
        dense_modules.append(nn.Linear(widths[-1], 2))
        self.fc_layers = nn.Sequential(*dense_modules)

        self.output_layer = ArcSoftmax(2, 10)
        self.nllloss = nn.NLLLoss()

    def forward(self, x):
        """Return ``(features, outputs)``: the 2-D features and the ArcSoftmax output."""
        conv_out = self.conv_layers(x)
        flat = conv_out.view(conv_out.size(0), -1)
        features = self.fc_layers(flat)
        return features, self.output_layer(features)

    def getLoss(self, outputs, labels):
        """Return the NLL loss of ``outputs`` against ``labels``."""
        return self.nllloss(outputs, labels)
class SimpleNet(nn.Module):
    """Unpadded 5x5-conv network that maps 1-channel images to 2-D features.

    The hidden stack expects 32 * 12 * 12 flattened values from the conv stack.
    """

    def __init__(self):
        super().__init__()

        # Each unpadded 5x5 conv shrinks height and width by 4; for a 28x28
        # input the sizes go 24 -> 20 -> 16 -> 12.
        channels = (1, 64, 128, 64, 32)
        conv_modules = []
        for in_channels, out_channels in zip(channels[:-1], channels[1:]):
            conv_modules.append(nn.Conv2d(in_channels, out_channels, 5, bias=False))
            conv_modules.append(nn.BatchNorm2d(out_channels))
            conv_modules.append(nn.Mish())
        self.input_layer = nn.Sequential(*conv_modules)

        # Linear (with bias) -> BatchNorm -> Mish, twice, then project to 2-D.
        widths = (32 * 12 * 12, 12 * 12, 6 * 6)
        dense_modules = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            dense_modules.append(nn.Linear(in_features, out_features))
            dense_modules.append(nn.BatchNorm1d(out_features))
            dense_modules.append(nn.Mish())
        dense_modules.append(nn.Linear(widths[-1], 2))
        self.hidden_layer = nn.Sequential(*dense_modules)

        self.ArcSoftmax_layer = ArcSoftmax(2, 10)
        self.nllLoss = nn.NLLLoss()

    def forward(self, input):
        """Return ``(feature, output)``: the 2-D features and the ArcSoftmax output."""
        conv_out = self.input_layer(input)
        flat = conv_out.view(-1, 32 * 12 * 12)
        feature = self.hidden_layer(flat)
        output = self.ArcSoftmax_layer(feature.view(-1, 2))
        return feature, output

    def getLoss(self, output, labels):
        """Return the NLL loss of ``output`` against ``labels``."""
        return self.nllLoss(output, labels)
    
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the network and move it to the selected device.
net = MainNet().to(DEVICE)
# Disabled smoke test: one random 1x1x28x28 input through the net in eval mode.
# net.eval()
# input = torch.randn(1,1,28,28).to(DEVICE)
# net(input)

特征空间分类越好，分类效果越好