Trying to write yolo1 with pytorch, but get NaN when train

Asked by: Xing · Asked: 11/17/2023 · Last edited by: Randall · Updated: 11/18/2023 · Views: 24

Q:

Please, I want to know whether my model, my loss function, or my dataset handling is wrong.

I tried printing a lot of things for debugging, but I still don't know where the problem is.

My source code is:

import time
import torch
import torch.nn as nn
import cv2
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from torchvision import transforms
import torchvision
import torch.optim.lr_scheduler as lr_scheduler
NC = 102  # number of classes
BC = 2  # number of bounding boxes per grid cell
CLASSES = []

transform = transforms.Compose([
    transforms.Resize(448, antialias=True),  # resize
    transforms.RandomCrop(448),  # random crop
    transforms.RandomHorizontalFlip(),  # horizontal flip
    transforms.RandomVerticalFlip(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # normalize
])
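
# NOTE: the `transform` pipeline above is defined but never applied in Dataset.__getitem__
# below; if it were used, transforms.Normalize would also need a tensor input (e.g. a
# transforms.ToTensor() step first), since it does not accept PIL images or ndarrays.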


class Dataset(torch.utils.data.Dataset):
    def __init__(self, data_path, label_path):
        self.data_path = data_path
        self.label_path = label_path
        self.data = []
        for i in tqdm(os.listdir(data_path)):
            img = os.path.join(data_path, i)
            label = os.path.join(label_path, i.split('.')[0] + '.txt')
            self.data.append([img, label])

    def __getitem__(self, index):
        img = cv2.imread(self.data[index][0])
        # img = Image.open(self.data[index][0]).convert('RGB')
        h, w = img.shape[0], img.shape[1]
        padh, padw = 0, 0
        # if h > w:
        #     padw = (h - w) // 2
        #     img = np.pad(img, ((0, 0), (padw, h - w - padw), (0, 0)), 'constant', constant_values=0)
        # elif h < w:
        #     padh = (w - h) // 2
        #     img = np.pad(img, ((padh, w - h - padh), (0, 0), (0, 0)), 'constant', constant_values=0)
        img = cv2.resize(img, (448, 448))
        img = torch.from_numpy(np.array(img)).permute(2, 0, 1).float() / 255.0

        label = open(self.data[index][1]).readlines()
        label = [[float(j) for j in i.strip().split()] for i in label]
        for i in range(len(label)):
            if len(label[i]) % 5 != 0:
                raise ValueError('label error')
            if padw != 0:
                label[i][2] = (label[i][2] * h + padh) / h
                label[i][3] = (label[i][3] * w + h - w) / h
            if padh != 0:
                label[i][1] = (label[i][1] * w + padw) / w
                label[i][4] = (label[i][4] * h + w - h) / w

        # label = [x for y in label for x in y]  # flatten the labels into a single list

        # padding was added on the left/top and the image was resized, so the original box positions are shifted and need the corresponding adjustment
        # print(label)
        label = deal_label(label)
        label = torch.tensor(label).float()
        return img, label, self.data[index][0]

    def __len__(self):
        return len(self.data)


def deal_label(label_list):
    # [ # label_list
    #     [label,x, y, w, h]
    # ]
    gridsize = 1.0 / 7  # 7 is the final number of grid cells per side
    labels = np.zeros((7, 7, (NC + 5 * BC)))  # (grid cells in x, grid cells in y, 5 * num boxes + num classes)
    for i in label_list:
        gridx = int(i[1] // gridsize)
        gridy = int(i[2] // gridsize)
        labels[gridx, gridy, 5 * 2 + int(i[0])] = 1
        labels[gridx, gridy, 0:5] = np.array([1, i[1], i[2], i[3], i[4]])  # confi,x,y,w,h
        labels[gridx, gridy, 5:10] = np.array([1, i[1], i[2], i[3], i[4]])

    return labels
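
# With NC = 102 and BC = 2, each grid cell produced by deal_label above is a
# 112-vector laid out as [conf, x, y, w, h, conf, x, y, w, h, 102 one-hot class
# scores]; note that both box slots are filled with the same ground-truth box.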


data = Dataset(r'C:\Users\xingshuyin\python\datasets\disease_insect_detect\images\train',
               r'C:\Users\xingshuyin\python\datasets\disease_insect_detect\labels\train')


def collate_fn(batch):
    data = [item[0] for item in batch]
    label = [item[1] for item in batch]
    files = [item[2] for item in batch]
    return torch.stack(data), torch.stack(label), files


loader = torch.utils.data.DataLoader(data, batch_size=8, shuffle=True, num_workers=2, collate_fn=collate_fn)
loader_val = torch.utils.data.DataLoader(data, batch_size=1, shuffle=True, num_workers=1, collate_fn=collate_fn)
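
# NOTE: loader_val reuses the same training Dataset instance above; a held-out
# validation split would normally be used instead.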


def iou(b1, b2):
    """
    b1, b2 -> (NC + 5 * BC) vectors for a single grid cell
    (NC + 5 * BC) -> confi cx cy w h confi cx cy w h cls1 cls2 ...
    """
    b11 = b1[0:5]
    b12 = b1[5:10]
    b21 = b2[0:5]
    b22 = b2[5:10]
    b11_area = b11[3] * b11[4]
    b12_area = b12[3] * b12[4]
    b21_area = b21[3] * b21[4]
    b22_area = b22[3] * b22[4]

    intersect_x_b11_b21 = torch.min(b11[1] + b11[3] / 2, b21[1] + b21[3] / 2) - torch.max(b11[1] - b11[3] / 2, b21[1] - b21[3] / 2)
    intersect_y_b11_b21 = torch.min(b11[2] + b11[4] / 2, b21[2] + b21[4] / 2) - torch.max(b11[2] - b11[4] / 2, b21[2] - b21[4] / 2)
    intersect_area_b11_b21 = intersect_x_b11_b21 * intersect_y_b11_b21

    intersect_x_b12_b22 = torch.min(b12[1] + b12[3] / 2, b22[1] + b22[3] / 2) - torch.max(b12[1] - b12[3] / 2, b22[1] - b22[3] / 2)
    intersect_y_b12_b22 = torch.min(b12[2] + b12[4] / 2, b22[2] + b22[4] / 2) - torch.max(b12[2] - b12[4] / 2, b22[2] - b22[4] / 2)
    intersect_area_b12_b22 = intersect_x_b12_b22 * intersect_y_b12_b22
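
    # NOTE (a guess at one NaN source): when two boxes do not overlap, the intersect_x/
    # intersect_y terms above are negative, so their product can be spuriously positive;
    # clamping each with .clamp(min=0) would avoid that. Also, once pred itself contains
    # NaN, every IoU derived from it below is NaN as well.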

    iou1 = intersect_area_b11_b21 / (b11_area + b21_area - intersect_area_b11_b21)
    iou2 = intersect_area_b12_b22 / (b12_area + b22_area - intersect_area_b12_b22)

    # print('b11 ', b11, 'b11_area ', b11_area)
    # print('b12 ', b12, 'b12_area ', b12_area)
    # print('b21 ', b21, 'b21_area ', b21_area)
    # print('b22 ', b22, 'b22_area ', b22_area)
    print('iou1 ', iou1, 'iou2 ', iou2)
    return iou1 if iou1 > 0 else 0, iou2 if iou2 > 0 else 0


class YOLOLOSS(nn.Module):
    def __init__(self, S):
        super(YOLOLOSS, self).__init__()
        self.S = S  # number of grid cells per side
        self.weight_xy = 5.0  # coordinate (x, y) loss weight
        self.weight_wh = 5.0  # size (w, h) loss weight
        self.weight_noobj_confi = 0.5  # confidence loss weight for cells without an object
        self.weight_obj_confi = 1  # confidence loss weight for cells with an object
        self.weight_class = 1

    def forward(self, pred, label):
        """
        pred -> batch S S (NC + 5 * BC)
        (NC + 5 * BC) -> confi cx cy w h confi cx cy w h cls1 cls2 ...

        label -> batch S S (NC + 5 * BC)
        """
        batch_size = pred.size(0)

        loss_xy_obj = 0
        loss_wh_obj = 0
        loss_obj_confi = 0
        loss_noobj_confi = 0
        loss_class = 0
        for i in range(batch_size):
            for x in range(self.S):
                for y in range(self.S):
                    if label[i, x, y, 0] == 1 and label[i, x, y, 5] == 1:  # if the label has an object in this cell, this cell is responsible for predicting it
                        iou1, iou2 = iou(pred[i, x, y], label[i, x, y])
                        if iou1 >= iou2:
                            loss_xy_obj += self.weight_xy * torch.sum((pred[i, x, y, 1:3] - label[i, x, y, 1:3])**2)  # line 1 of the loss formula
                            loss_wh_obj += self.weight_wh * torch.sum((pred[i, x, y, 3:5].sqrt() - label[i, x, y, 3:5].sqrt())**2)  # line 2 of the loss formula
                            loss_obj_confi += self.weight_obj_confi * torch.sum((pred[i, x, y, 0] - iou1)**2)  # line 3 of the loss formula
                            loss_noobj_confi += self.weight_noobj_confi * torch.sum((pred[i, x, y, 5] - iou2)**2)  # line 4 of the loss formula
                        else:
                            loss_xy_obj += self.weight_xy * torch.sum((pred[i, x, y, 6:8] - label[i, x, y, 6:8])**2)
                            loss_wh_obj += self.weight_wh * torch.sum((pred[i, x, y, 8:10].sqrt() - label[i, x, y, 8:10].sqrt())**2)
                            loss_obj_confi += self.weight_obj_confi * torch.sum((pred[i, x, y, 5] - iou2)**2)
                            loss_noobj_confi += self.weight_noobj_confi * torch.sum((pred[i, x, y, 0] - iou1)**2)
                        loss_class += self.weight_class * torch.sum((pred[i, x, y, 10:] - label[i, x, y, 10:])**2)  # line 5 of the loss formula
                    else:
                        loss_noobj_confi += self.weight_noobj_confi * (torch.sum(pred[i, x, y, [0, 5]]**2))  # line 4 of the loss formula
        print('loss ------------', 'loss_xy_obj ', loss_xy_obj, 'loss_wh_obj ', loss_wh_obj, 'loss_obj_confi ',
              loss_obj_confi, 'loss_noobj_confi ', loss_noobj_confi, 'loss_class ', loss_class)
        return (loss_xy_obj + loss_wh_obj + loss_obj_confi + loss_noobj_confi + loss_class) / batch_size
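
# NOTE (assumption, a classic YOLOv1 failure mode): the derivative of sqrt(x) tends to
# infinity as x -> 0, so the .sqrt() terms in the loss above produce enormous gradients
# whenever a predicted w/h is near 0, which can turn the weights - and every later
# prediction - into NaN. A common guard is torch.sqrt(pred[i, x, y, 3:5] + 1e-8) or
# clamping before the sqrt.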


class YOLO(torch.nn.Module):
    def __init__(self):
        super(YOLO, self).__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 64, 7, 2, 3),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 192, 3, 1, 1),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(192, 128, 1, 1, 0),  # the paper shows no padding because everything is 'same', so a 3x3 kernel needs padding=1
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.Conv2d(256, 256, 1, 1, 0),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.MaxPool2d(2, 2),

            nn.Conv2d(512, 256, 1, 1, 0),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.Conv2d(512, 256, 1, 1, 0),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.Conv2d(512, 256, 1, 1, 0),
            nn.Conv2d(256, 512, 3, 1, 1),
            nn.Conv2d(512, 256, 1, 1, 0),
            nn.Conv2d(256, 512, 3, 1, 1),


            # nn.Conv2d(512, 512, 1),
            nn.Conv2d(512, 1024, 3, 1, 1),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(1024, 512, 1, 1, 0),
            nn.Conv2d(512, 1024, 3, 1, 1),
            nn.Conv2d(1024, 512, 1, 1, 0),
            nn.Conv2d(512, 1024, 3, 1, 1),
            nn.LeakyReLU(),


            # nn.Conv2d(1024, 1024, 3),
            nn.Conv2d(1024, 1024, 3, 2, 1),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(),
            nn.Conv2d(1024, 1024, 3, 1, 1),
            nn.LeakyReLU(),
            nn.Conv2d(1024, 1024, 3, 1, 1),
            nn.LeakyReLU(),
        )
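        # Shape check (sketch): 448 -(conv s2)-> 224 -pool-> 112 -pool-> 56 -pool-> 28
        # -pool-> 14 -(conv s2)-> 7, so self.net maps (B, 3, 448, 448) to (B, 1024, 7, 7),
        # matching the 7 * 7 * 1024 input of the first Linear below.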
        self.net2 = nn.Sequential(
            nn.Linear(7 * 7 * 1024, 4096),
            nn.LeakyReLU(),
            nn.Linear(4096, 7 * 7 * (NC + 5 * BC)),
            nn.Sigmoid()
        )
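        # NOTE: most Conv2d layers in self.net above are stacked with no activation in
        # between (only a few LeakyReLU calls), unlike the original YOLOv1 network, which
        # applies a leaky ReLU after every layer except the last.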

    def forward(self, x):
        """
        x -> betch S S (NC + 5 * BC)
        """
        # TODO
        out = self.net(x)
        out = self.net2(out.view(out.size(0), -1))
        # out = out * 100
        return out.view(-1, 7, 7, (NC + 5 * BC))  # the returned tensor keeps the batch dimension


def train():
    m = YOLO()  # create the model
    m.to('cuda')  # move the model to CUDA
    loss = YOLOLOSS(7)  # create the loss function
    lr = 0.01
    opt = torch.optim.SGD(m.parameters(), lr=lr, momentum=0.9, weight_decay=0.0005)  # create the optimizer
    # scheduler = lr_scheduler.CosineAnnealingWarmRestarts(opt, T_0=100, T_mult=5, eta_min=0.001)
    loss_min = 9999
    for E in range(5):
        pbar = tqdm(loader)
        for i in pbar:
            m.zero_grad()  # zero the gradients, otherwise they accumulate
            data, label, files = i[0].to('cuda'), i[1].to('cuda'), i[2]
            print(data.size())
            r = m(data)
            # print(r.size())
            # network visualization
            # g = make_dot(r ,params=dict(m.named_parameters()), show_attrs=True, show_saved=True)
            # g.render(filename='graph', view=False)
            print(data[0][0][0])
            print(label[0][0][0])
            print(r[0][0][0])
            l = loss(r, label)  # compute the loss
            pbar.set_description(F"epoch {E} | loss_min: {loss_min} | loss: {l} | lr:{lr} ")
            # print(files)
            # loss.grad_fn.next_functions[0][0].next_functions walks the backward graph level by level
            if l < loss_min and l < 5:
                loss_min = l
                torch.save(m.state_dict(), './model')

            l.backward()  # backpropagation
            opt.step()  # update the parameters
            # scheduler.step()
            lr = opt.param_groups[0]['lr']
            time.sleep(0.5)


def pred_to_box(pred):
    boxes = []
    for i in range(len(pred)):
        b = []
        for j in range(len(pred[i])):
            # torch.cat expects a sequence of tensors, so the slices must be wrapped in a tuple
            b.append(torch.cat((pred[i][j][:5], pred[i][j][10:])))
            b.append(torch.cat((pred[i][j][5:10], pred[i][j][10:])))
        boxes.append(b)
    return boxes


def draw_box(img, boxes):
    ...


def val():
    m = YOLO()
    m.load_state_dict(torch.load('./model'))
    m.eval()

    for i in tqdm(loader_val):
        data, label, files = i[0].to('cuda'), i[1].to('cuda'), i[2]
        r = m(data)
        date = []


if __name__ == '__main__':
    torch.set_printoptions(profile="full")
    train()

When I run the program, I get the output below. The third printed tensor (the model output r[0][0][0]) is all nan:

loss ------------ loss_xy_obj  tensor(24.2689, device='cuda:0', grad_fn=<AddBackward0>) loss_wh_obj  tensor(16.9818, device='cuda:0', grad_fn=<AddBackward0>) loss_obj_confi  tensor(4.2992, device='cuda:0', grad_fn=<AddBackward0>) loss_noobj_confi  tensor(3.0024e-05, device='cuda:0', grad_fn=<AddBackward0>) loss_class  tensor(67.2501, device='cuda:0', grad_fn=<AddBackward0>)
epoch 0 | loss_min: 9999 | loss: 14.099996566772461 | lr:0.01 :   0%|▊                                                                                                                                                                              | 9/2024 [00:10<27:03,  1.24it/s]torch.Size([8, 3, 448, 448])
tensor([0.3020, 0.3020, 0.3020, 0.2980, 0.2941, 0.2941, 0.2941, 0.2941, 0.2941,
        0.2902, 0.2863, 0.2824, 0.2784, 0.2745, 0.2784, 0.2784, 0.2784, 0.2745,
        0.2667, 0.2627, 0.2627, 0.2549, 0.2549, 0.2588, 0.2588, 0.2627, 0.2667,
        0.2706, 0.2667, 0.2549, 0.2627, 0.2706, 0.2706, 0.2667, 0.2588, 0.2549,
        0.2706, 0.2706, 0.2706, 0.2706, 0.2706, 0.2706, 0.2706, 0.2706, 0.2706,
        0.2706, 0.2706, 0.2706, 0.2706, 0.2706, 0.2745, 0.2745, 0.2745, 0.2745,
        0.2745, 0.2745, 0.2745, 0.2706, 0.2706, 0.2706, 0.2745, 0.2745, 0.2745,
        0.2784, 0.2863, 0.2902, 0.2863, 0.2824, 0.2863, 0.2902, 0.2980, 0.3020,
        0.2980, 0.2980, 0.2980, 0.2980, 0.2980, 0.2980, 0.2980, 0.3059, 0.3059,
        0.3059, 0.3098, 0.3137, 0.3137, 0.3137, 0.3059, 0.3020, 0.3020, 0.3020,
        0.3020, 0.3098, 0.3137, 0.3020, 0.3059, 0.3098, 0.3137, 0.3137, 0.3137,
        0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098,
        0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3137, 0.3216, 0.3216,
        0.3216, 0.3216, 0.3216, 0.3216, 0.3216, 0.3216, 0.3216, 0.3216, 0.3216,
        0.3216, 0.3216, 0.3216, 0.3255, 0.3216, 0.3216, 0.3176, 0.3176, 0.3216,
        0.3255, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176,
        0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176, 0.3176,
        0.3137, 0.3137, 0.3176, 0.3176, 0.3216, 0.3216, 0.3216, 0.3176, 0.3176,
        0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137,
        0.3137, 0.3059, 0.3059, 0.3098, 0.3137, 0.3137, 0.3216, 0.3216, 0.3137,
        0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137,
        0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3098, 0.3098,
        0.3137, 0.3137, 0.3137, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098, 0.3098,
        0.3098, 0.3176, 0.3137, 0.3098, 0.3098, 0.3098, 0.3137, 0.3176, 0.3137,
        0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3137, 0.3098, 0.3098, 0.3098,
        0.3098, 0.3098, 0.3098, 0.3098, 0.3176, 0.3176, 0.3098, 0.3098, 0.3098,
        0.3098, 0.3176, 0.3137, 0.3059, 0.3059, 0.3020, 0.3020, 0.3059, 0.3059,
        0.3098, 0.3059, 0.3059, 0.3059, 0.3059, 0.3059, 0.2980, 0.2980, 0.2941,
        0.2941, 0.2902, 0.2863, 0.2784, 0.2784, 0.2784, 0.2980, 0.2980, 0.2980,
        0.2941, 0.2902, 0.2902, 0.2902, 0.2784, 0.2824, 0.2824, 0.2863, 0.2863,
        0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902,
        0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902, 0.2902,
        0.2824, 0.2745, 0.2745, 0.2784, 0.2863, 0.2902, 0.2784, 0.2784, 0.2745,
        0.2745, 0.2706, 0.2706, 0.2706, 0.2627, 0.2667, 0.2745, 0.2784, 0.2745,
        0.2706, 0.2627, 0.2627, 0.2627, 0.2627, 0.2627, 0.2627, 0.2627, 0.2667,
        0.2784, 0.2863, 0.2863, 0.2863, 0.2824, 0.2980, 0.3725, 0.4431, 0.4980,
        0.5412, 0.5608, 0.5608, 0.5529, 0.5608, 0.5569, 0.5373, 0.4941, 0.4275,
        0.3490, 0.2706, 0.2078, 0.1725, 0.1922, 0.2078, 0.2196, 0.2157, 0.1922,
        0.1608, 0.1412, 0.1255, 0.1333, 0.1412, 0.1490, 0.1529, 0.1490, 0.1451,
        0.1373, 0.1294, 0.1255, 0.1216, 0.1137, 0.1255, 0.1333, 0.1569, 0.1765,
        0.1882, 0.1922, 0.1804, 0.1686, 0.1569, 0.1569, 0.1647, 0.1686, 0.1647,
        0.1529, 0.1490, 0.1529, 0.1686, 0.1647, 0.1647, 0.1608, 0.1647, 0.1686,
        0.1804, 0.1961, 0.2510, 0.2667, 0.2863, 0.2941, 0.2941, 0.2784, 0.2667,
        0.2627, 0.2588, 0.2588, 0.2471, 0.2353, 0.2196, 0.2078, 0.2078, 0.2078,
        0.2078, 0.2235, 0.2431, 0.2627, 0.2667, 0.2627, 0.2588, 0.2627, 0.2706,
        0.2824, 0.2706, 0.2588, 0.2627, 0.2627, 0.2431, 0.2314, 0.2353, 0.2275,
        0.2235, 0.2039, 0.2039, 0.2196, 0.2314, 0.2314, 0.2314, 0.2353, 0.2431,
        0.2667, 0.2588, 0.2588, 0.2627, 0.2667, 0.2627, 0.2510, 0.2196, 0.2235,
        0.2549, 0.2784, 0.2863, 0.2902, 0.2824, 0.2824, 0.3176, 0.3412, 0.3294,
        0.3216, 0.3333, 0.3333, 0.2941, 0.2510, 0.2275, 0.2039],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', grad_fn=<SelectBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
loss ------------ loss_xy_obj  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>) loss_wh_obj  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>) loss_obj_confi  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>) loss_noobj_confi  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>) loss_class  tensor(nan, device='cuda:0', grad_fn=<AddBackward0>)
epoch 0 | loss_min: 9999 | loss: nan | lr:0.01 :   0%|▉                                                                                                                                                                                            | 10/2024 [00:11<26:03,  1.29it/s]torch.Size([8, 3, 448, 448])
tensor([0.6549, 0.7216, 0.7098, 0.4941, 0.3294, 0.2745, 0.3529, 0.3843, 0.4275,
        0.4941, 0.6275, 0.6863, 0.7137, 0.7529, 0.7137, 0.6353, 0.5216, 0.4941,
        0.4863, 0.4980, 0.5020, 0.5137, 0.5373, 0.5294, 0.5333, 0.5451, 0.5569,
        0.5569, 0.5569, 0.5647, 0.5765, 0.5882, 0.6000, 0.6078, 0.6196, 0.6353,
        0.6392, 0.6392, 0.6392, 0.6471, 0.6549, 0.6588, 0.6627, 0.6627, 0.6667,
        0.6667, 0.6667, 0.6667, 0.6706, 0.6824, 0.6902, 0.6941, 0.7020, 0.7059,
        0.7137, 0.7216, 0.7216, 0.7176, 0.7137, 0.7059, 0.7020, 0.6980, 0.6784,
        0.6706, 0.6706, 0.6667, 0.6627, 0.6588, 0.6588, 0.6510, 0.6392, 0.6353,
        0.6314, 0.6314, 0.6314, 0.6314, 0.6275, 0.6196, 0.6118, 0.6039, 0.5961,
        0.6039, 0.6039, 0.6000, 0.5961, 0.6000, 0.6000, 0.6000, 0.5882, 0.5725,
        0.5490, 0.5294, 0.5333, 0.5412, 0.5373, 0.5333, 0.5294, 0.5294, 0.5333,
        0.5373, 0.5451, 0.5569, 0.5647, 0.5686, 0.5725, 0.5725, 0.5804, 0.5882,
        0.5961, 0.6078, 0.6196, 0.6314, 0.6431, 0.6549, 0.6667, 0.6745, 0.6784,
        0.6902, 0.6980, 0.7059, 0.7059, 0.7020, 0.7020, 0.7020, 0.7059, 0.7059,
        0.7098, 0.6941, 0.6902, 0.6902, 0.6902, 0.6902, 0.6902, 0.6902, 0.6902,
        0.6902, 0.6863, 0.6784, 0.6745, 0.6745, 0.6667, 0.6667, 0.6667, 0.6706,
        0.6706, 0.6745, 0.6784, 0.6824, 0.6863, 0.6902, 0.6980, 0.7020, 0.7020,
        0.7020, 0.7020, 0.7020, 0.6980, 0.6941, 0.6941, 0.6902, 0.6863, 0.6784,
        0.6745, 0.6706, 0.6706, 0.6706, 0.6667, 0.6627, 0.6627, 0.6588, 0.6510,
        0.6431, 0.6392, 0.6275, 0.6157, 0.6157, 0.6157, 0.6196, 0.6235, 0.6157,
        0.6157, 0.6157, 0.6118, 0.6118, 0.6118, 0.6078, 0.6078, 0.6118, 0.6118,
        0.6078, 0.6039, 0.6000, 0.5922, 0.5961, 0.6039, 0.6078, 0.6039, 0.6000,
        0.6078, 0.6157, 0.6235, 0.6275, 0.6314, 0.6314, 0.6353, 0.6510, 0.6627,
        0.6667, 0.6667, 0.6706, 0.6745, 0.6824, 0.6902, 0.7020, 0.7098, 0.7137,
        0.7176, 0.7176, 0.7176, 0.7294, 0.7490, 0.7333, 0.7294, 0.7333, 0.7412,
        0.7333, 0.7333, 0.7412, 0.7490, 0.7490, 0.7490, 0.7725, 0.7843, 0.7804,
        0.7725, 0.7725, 0.7765, 0.7843, 0.7843, 0.7804, 0.7725, 0.7725, 0.7725,
        0.7765, 0.7725, 0.7725, 0.7686, 0.7725, 0.7725, 0.7765, 0.7804, 0.7765,
        0.7765, 0.7804, 0.7804, 0.7804, 0.7765, 0.7765, 0.7765, 0.7725, 0.7647,
        0.7569, 0.7569, 0.7529, 0.7490, 0.7373, 0.6941, 0.7137, 0.7216, 0.7020,
        0.7294, 0.7176, 0.6667, 0.6196, 0.6471, 0.6980, 0.6784, 0.6353, 0.5922,
        0.5843, 0.5686, 0.5451, 0.5137, 0.4902, 0.4667, 0.4510, 0.4431, 0.4431,
        0.4353, 0.4196, 0.4118, 0.4078, 0.4118, 0.4118, 0.4118, 0.4196, 0.4196,
        0.4235, 0.4314, 0.4235, 0.4235, 0.4275, 0.4235, 0.4235, 0.4275, 0.4353,
        0.4392, 0.4431, 0.4510, 0.4510, 0.4549, 0.4588, 0.4745, 0.4824, 0.4863,
        0.4980, 0.5020, 0.5020, 0.5059, 0.5059, 0.5098, 0.5216, 0.5255, 0.5216,
        0.5098, 0.5098, 0.5098, 0.5098, 0.5059, 0.4980, 0.4902, 0.4784, 0.4706,
        0.4588, 0.4510, 0.4314, 0.4196, 0.4118, 0.3961, 0.3882, 0.3804, 0.3725,
        0.3686, 0.3608, 0.3529, 0.3490, 0.3412, 0.3294, 0.3373, 0.3490, 0.3608,
        0.3765, 0.3882, 0.4000, 0.4196, 0.4353, 0.4510, 0.4627, 0.4706, 0.4784,
        0.4980, 0.5059, 0.5098, 0.5098, 0.5098, 0.5098, 0.5137, 0.5059, 0.4902,
        0.4824, 0.4941, 0.4824, 0.4627, 0.4353, 0.4157, 0.4000, 0.3922, 0.3765,
        0.3608, 0.3412, 0.3333, 0.3255, 0.3176, 0.3098, 0.3020, 0.2902, 0.2863,
        0.2863, 0.2902, 0.2902, 0.2863, 0.2902, 0.2980, 0.3137, 0.3490, 0.3882,
        0.4314, 0.4745, 0.5176, 0.5569, 0.5961, 0.6314, 0.6667, 0.6902, 0.7098,
        0.7255, 0.7529, 0.7765, 0.7922, 0.8000, 0.8039, 0.8078, 0.8039, 0.8039,
        0.7961, 0.7804, 0.7608, 0.7490, 0.7412, 0.7333, 0.7216, 0.7137, 0.7059,
        0.6941, 0.6784, 0.6706, 0.6588, 0.6510, 0.6549, 0.6431, 0.6353, 0.6275,
        0.6157, 0.6157, 0.6157, 0.6078, 0.6118, 0.6157, 0.6235, 0.6314, 0.6431,
        0.6510, 0.6510, 0.6510, 0.6549, 0.6549, 0.6510, 0.6510],
       device='cuda:0')
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
tensor([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
       device='cuda:0', grad_fn=<SelectBackward0>)
iou1  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>) iou2  tensor(nan, device='cuda:0', grad_fn=<DivBackward0>)
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x000001CE90375630>
Traceback (most recent call last):
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\site-packages\torch\utils\data\dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\site-packages\torch\utils\data\dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\multiprocessing\process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\multiprocessing\popen_spawn_win32.py", line 108, in wait
    res = _winapi.WaitForSingleObject(int(self._handle), msecs)
KeyboardInterrupt:
epoch 0 | loss_min: 9999 | loss: nan | lr:0.01 :   0%|▉                                                                                                                                                                                            | 10/2024 [00:12<40:23,  1.20s/it] 
Traceback (most recent call last):
  File "c:\Users\xingshuyin\python\yolov8\yolo1.py", line 354, in <module>
  File "c:\Users\xingshuyin\python\yolov8\yolo1.py", line 311, in train
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\xingshuyin\app\miniconda\envs\yolov5\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "c:\Users\xingshuyin\python\yolov8\yolo1.py", line 167, in forward
    iou1, iou2 = iou(pred[i, x, y], label[i, x, y])
  File "c:\Users\xingshuyin\python\yolov8\yolo1.py", line 135, in iou
    print('iou1 ', iou1, 'iou2 ', iou2)
KeyboardInterrupt
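
A minimal sketch for tracing this (assuming stock PyTorch behavior): enabling anomaly detection makes backward() raise at the exact op that first produced a NaN, at the cost of slower training.

torch.autograd.set_detect_anomaly(True)  # call once before the training loop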

python pytorch nan yolo

Comments

0 upvotes · Randall · 11/18/2023
Welcome to StackOverflow! Your question seems a bit overwhelming - it looks like you included your complete source code. StackOverflow is mainly for targeted questions, not code review. Could you cut it down to a form with placeholder code (like draw_box is) for any functions unrelated to the problem? Do you know which function is most likely the problem - can you point to it by name? For the output, could you provide the original command you ran? Finally, please add code comments showing which line each traceback entry refers to.
0 upvotes · Randall · 11/18/2023
Have you tried looking carefully at the code around each line mentioned in the traceback?

Answers: No answers yet