如何在customdatagenerator中获取正确的confusion_matrix数据

How to get correct confusion_matrix data in customdatagenerator

提问人:Syuuuu 提问时间:8/8/2023 更新时间:8/9/2023 访问量:26

问:

我正在构建 confusion_matrix,但得到的 y_true 形状(样本数)总是不对。

我认为我的 y_label 是正确的,我一共有 62 条验证(val)数据。

我不知道应该在哪里声明 y_true。

ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
  File "C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py", line 213, in <module>
    sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]

我尝试过:在 get_data 中向 self.y_true 追加标签,用 def get_y_true 返回 self.y_true,在 on_epoch_end 中执行“self.y_true = []”,并设置 shuffle=False。

这里是 CustomDataGenerator。

我应该在哪里声明“self.y_true = []”

# Dataset roots for the training and validation splits.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'

# Numeric suffixes of the sub-folders in use (note that 5 is not among them).
# Image and label folders share the same suffixes, so both lists are derived
# from this one tuple to keep them in step.
_folder_ids = (0, 1, 2, 3, 4, 6, 7)
image_folders = ['image' + str(folder_id) for folder_id in _folder_ids]
label_folders = ['label' + str(folder_id) for folder_id in _folder_ids]
class CustomDataGenerator(Sequence):
    """Batch generator pairing arrays loaded with ``np.load`` and text label files.

    Paths are collected in ``on_epoch_end`` and paired by list position in
    ``__getitem__``; the files themselves are read in ``get_data``.
    Parts of this excerpt are elided with ``...``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
        self.image_folders = image_folders
        # Remaining attribute assignments are elided in this excerpt;
        # presumably they store label_folders, dir, dim, batch_size,
        # n_classes, n_channels and shuffle, which the methods below read.
        ...
        self.image_paths = []
        self.label_paths = []
        # Reuse the epoch-end hook to fill the path lists right away.
        self.on_epoch_end()
    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))  
    def __getitem__(self, index):
        """Load and return the ``(X, y)`` batch at position ``index``."""
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        # Images and labels are matched purely by their position in the lists.
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)
    def on_epoch_end(self):
        """Rebuild both path lists from disk and clear the collected labels."""
        self.image_paths = []
        self.label_paths = []
        
        # Labels appended by get_data are discarded here.
        self.y_true = []
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            image_files = os.listdir(image_folder_path)
            for file_name in image_files:
                self.image_paths.append(os.path.join(image_folder_path, file_name))
        for folder in self.label_folders:
            # Loop body elided in this excerpt; presumably it fills
            # self.label_paths the same way as the image loop above.
            ...
                
        if self.shuffle:
            # NOTE(review): these are two independent shuffles, so the lists
            # receive different permutations and the position-based pairing
            # used in __getitem__ is not preserved when shuffle is enabled.
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)
    def get_data(self, batch):
        """Read one batch from disk.

        Args:
            batch: iterable of ``(image_path, label_path)`` pairs.

        Returns:
            ``(X, y)`` where ``X`` has shape
            ``(batch_size, *dim, n_channels)`` and ``y`` has shape
            ``(batch_size, n_classes)`` holding one-hot labels.

        Side effect: every integer label read is appended to ``self.y_true``.
        """
        # np.empty leaves the buffers uninitialised and they are always sized
        # to the full batch_size, so rows beyond the number of pairs in
        # `batch` (a shorter final batch) keep arbitrary values.
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        
        for i, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                # Only the first line is used; it is split on its last space
                # and the text after that space is parsed as the class id.
                line = f.readline().strip()
                filepath, label = line.rsplit(' ', 1)
                label = int(label)
                # NOTE(review): this runs on every fetch, so if any batch is
                # requested more than once (Keras may fetch a batch ahead of
                # the actual pass - TODO confirm) y_true grows longer than
                # the dataset.
                self.y_true.append(label)
            label_one_hot = to_categorical(label, num_classes=self.n_classes)

            X[i,] = image
            y[i,] = label_one_hot
            
        return X, y
    
    def get_y_true(self):
        """Return the list of labels appended by ``get_data`` since the last reset.

        The list object itself is returned, not a copy.
        """
        return self.y_true

这是获取y_true和y_pred,并构建confusion_matrix

在这里,“y_true = val_datagen.get_y_true()”应该放在这一行“Y_pred = model.predict”之前还是之后?

train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, **params, shuffle = True)
# shuffle=False keeps the validation batches in a fixed order, so the labels
# read below line up position-by-position with the predictions.
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, **params, shuffle = False)

# Take the ground truth from the generator's own batches (one pass over every
# batch index) instead of from a list the generator fills as a side effect.
# A side-effect list grows on every batch fetch, including any extra fetch
# made by model.predict, which is how 63 labels can appear for 62 samples.
# Each batch's second element is the one-hot label array; argmax turns it
# back into class ids.
y_true = np.concatenate(
    [np.argmax(val_datagen[i][1], axis=1) for i in range(len(val_datagen))]
)
Y_pred = model.predict(val_datagen)
y_pred = np.argmax(Y_pred, axis=1)
fig, ax = plt.subplots(figsize=(12,6))
sns.heatmap(confusion_matrix(y_true, y_pred),annot=True, fmt="d", cmap='Greens',ax = ax)
python tensorflow keras 混淆矩阵

评论

0赞 mhenning 8/9/2023
你好,又见面了。您想获取一个批次的 y_true,还是整个 val_datagen 数据集的 y_true?
0赞 Syuuuu 8/9/2023
@mhenning我想从整个val_datagen数据集中获取y_true,我尝试调整“self.y_true = []”位置,但总是失败。谢谢!

答:

0赞 mhenning 8/9/2023 #1

我无法测试它,但它现在应该可以工作了。我将 self.y_true=[] 从 on_epoch_end() 移至 __getitem__(),只有在调用第一批时才会重置它。在纪元(epoch)开始时的回调中重置会更好。这仅在数据集至少被调用一次后才起作用,因为图像和标签是在 get_data() 中按批次加载的。
但我不确定批量大小为 1 时加载了多少图像。似乎一个批次获得的是图像文件夹路径,该文件夹中是否有多个图像?

# Dataset roots for the training and validation splits.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'

# Numeric suffixes of the sub-folders in use (note that 5 is not among them).
# Image and label folders share the same suffixes, so both lists are derived
# from this one tuple to keep them in step.
_folder_ids = (0, 1, 2, 3, 4, 6, 7)
image_folders = ['image' + str(folder_id) for folder_id in _folder_ids]
label_folders = ['label' + str(folder_id) for folder_id in _folder_ids]

class CustomDataGenerator(Sequence):
    """Batch generator that serves ``(image, one-hot label)`` batches from disk.

    Image paths are gathered once in ``init_paths``; the label-path collection
    is elided (``...``) in this excerpt. Images and labels are paired by their
    position in ``image_paths`` / ``label_paths``, so both lists must always
    be reordered together.

    The integer labels served since the most recent fetch of batch 0 are kept
    in ``self.y_true`` and exposed through ``get_y_true``.
    """

    def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
        self.image_folders = image_folders
        # Remaining attribute assignments are elided in this excerpt.
        ...
        self.image_paths = []
        self.label_paths = []
        # Start with an empty label log so get_y_true() is usable (and simply
        # empty) before any batch has been served.
        self.y_true = []
        self.init_paths()

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))

    def __getitem__(self, index):
        """Load and return the ``(X, y)`` batch at position ``index``."""
        if index == 0:
            # Batch 0 marks the start of a pass: drop whatever was logged
            # before, so a repeated fetch of batch 0 cannot leave duplicate
            # labels behind.
            self.y_true = []
        start = index * self.batch_size
        stop = start + self.batch_size
        batch = zip(self.image_paths[start:stop], self.label_paths[start:stop])
        return self.get_data(batch)

    def init_paths(self):
        """Collect the file paths once and apply the initial shuffle."""
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            image_files = os.listdir(image_folder_path)
            for file_name in image_files:
                self.image_paths.append(os.path.join(image_folder_path, file_name))
        for folder in self.label_folders:
            # Loop body elided in this excerpt.
            ...
        if self.shuffle:
            self._shuffle_paths()

    def on_epoch_end(self):
        """Reshuffle the sample order between epochs when shuffling is on."""
        if self.shuffle:
            self._shuffle_paths()

    def _shuffle_paths(self):
        """Reorder image and label paths with one shared permutation.

        Shuffling the two lists separately would give each its own random
        order and silently attach images to the wrong labels.

        Raises:
            ValueError: if the two lists differ in length, because they can
                then not be paired by position at all.
        """
        if len(self.image_paths) != len(self.label_paths):
            raise ValueError(
                "image_paths and label_paths must have the same length to be "
                f"paired: {len(self.image_paths)} != {len(self.label_paths)}"
            )
        order = np.random.permutation(len(self.image_paths))
        self.image_paths = [self.image_paths[i] for i in order]
        self.label_paths = [self.label_paths[i] for i in order]

    def get_data(self, batch):
        """Read one batch from disk.

        Args:
            batch: iterable of ``(image_path, label_path)`` pairs.

        Returns:
            ``(X, y)`` where ``X`` has shape ``(len(batch), *dim, n_channels)``
            and ``y`` has shape ``(len(batch), n_classes)`` holding one-hot
            labels.

        Side effect: every integer label read is appended to ``self.y_true``.
        """
        # Materialise the pairs so the buffers can be sized to the real batch
        # length; sizing them to batch_size would leave uninitialised rows in
        # a shorter final batch.
        batch = list(batch)
        X = np.empty((len(batch), *self.dim, self.n_channels))
        y = np.empty((len(batch), self.n_classes))

        for i, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                # Only the first line is used; the text after its last space
                # is the integer class id.
                line = f.readline().strip()
            _, label = line.rsplit(' ', 1)
            label = int(label)
            self.y_true.append(label)

            X[i,] = image
            y[i,] = to_categorical(label, num_classes=self.n_classes)

        return X, y

    def get_y_true(self):
        """Return the labels logged since the most recent fetch of batch 0.

        The list is only complete after every batch has been served once
        (for example after ``model.predict(generator)``), so call this after
        the pass, not before it.
        """
        return self.y_true