import pandas as pd
import numpy as np
import cv2
from torch.utils.data.dataset import Dataset
class CustomDatasetFromCSV(Dataset):
    """Image dataset backed by a CSV with ``emotion`` and ``pixels`` columns.

    Each row's ``pixels`` value is a space-separated list of pixel values for
    one ``height`` x ``width`` image; ``emotion`` is the class label.

    NOTE: ``__getitem__`` ignores ``index`` and returns every image and every
    label on each call, so a ``DataLoader`` built on this class yields batches
    with an extra dataset-sized dimension.
    """

    def __init__(self, csv_path, transform=None):
        """Read the CSV and one-hot encode the ``emotion`` column.

        Args:
            csv_path: Passed straight to ``pandas.read_csv``.
            transform: Optional callable; stored on the instance but not
                applied anywhere in this class.
        """
        self.data = pd.read_csv(csv_path)
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
        # replacement. dtype='uint8' pins the labels to 0/1 integers
        # regardless of the pandas version's default dummy dtype.
        self.labels = pd.get_dummies(self.data['emotion'], dtype='uint8').to_numpy()
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        """Return ``(faces, labels)`` for the whole dataset.

        ``index`` is accepted but unused. ``faces`` is a float32 array with a
        trailing channel axis added by ``np.expand_dims``; ``labels`` is the
        full one-hot label matrix.
        """
        faces = []
        for pixel_sequence in self.data['pixels'].tolist():
            face = np.asarray([int(pixel) for pixel in pixel_sequence.split(' ')])
            face = face.reshape(self.width, self.height)
            face = cv2.resize(face.astype('uint8'), (self.width, self.height))
            faces.append(face.astype('float32'))
        faces = np.expand_dims(np.asarray(faces), -1)
        return faces, self.labels

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)
这是我参考其他存储库的代码后实现的。但是,我想将此数据集拆分为训练集和测试集。
我怎样才能在这个类内部做到这一点?还是需要另外写一个单独的类来实现?
从 PyTorch 0.4.1 开始,您可以使用 random_split
:
# Keep 80% of the samples for training; whatever is left becomes the test
# set, so the two sizes always add up to len(full_dataset).
train_size = int(len(full_dataset) * 0.8)
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset, [train_size, test_size]
)
使用 Pytorch 的 SubsetRandomSampler
:
import torch
import numpy as np
from torchvision import datasets
from torchvision import transforms
from torch.utils.data.sampler import SubsetRandomSampler
class CustomDatasetFromCSV(Dataset):
    """Image dataset backed by a CSV with ``emotion`` and ``pixels`` columns.

    Each row's ``pixels`` value is a whitespace-separated list of pixel values
    for one ``height`` x ``width`` image; ``emotion`` is the class label.
    One ``(face, label)`` pair is produced per index.
    """

    def __init__(self, csv_path, transform=None):
        """Read the CSV and one-hot encode the ``emotion`` column.

        Args:
            csv_path: Passed straight to ``pandas.read_csv``.
            transform: Optional callable applied to each image in
                ``__getitem__`` before it is returned.
        """
        self.data = pd.read_csv(csv_path)
        # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
        # replacement. dtype='uint8' pins the labels to 0/1 integers
        # regardless of the pandas version's default dummy dtype.
        self.labels = pd.get_dummies(self.data['emotion'], dtype='uint8').to_numpy()
        self.height = 48
        self.width = 48
        self.transform = transform

    def __getitem__(self, index):
        """Return the single ``(face, label)`` pair at position ``index``.

        This method should return only 1 sample and label (according to
        ``index``), not the whole dataset.
        """
        # .iloc is positional, matching how self.labels is indexed below.
        pixel_sequence = self.data['pixels'].iloc[index]
        face = np.asarray([int(pixel) for pixel in pixel_sequence.split()])
        # NumPy arrays are (rows, cols) == (height, width); cv2.resize takes
        # its target size as (width, height).
        face = face.reshape(self.height, self.width)
        face = cv2.resize(face.astype('uint8'), (self.width, self.height))
        if self.transform is not None:
            face = self.transform(face)
        label = self.labels[index]
        return face, label

    def __len__(self):
        """Return the number of samples (rows of the label matrix)."""
        return len(self.labels)
dataset = CustomDatasetFromCSV(my_path)
batch_size = 16
validation_split = .2
shuffle_dataset = True
random_seed = 42

# Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    # Seed NumPy so the same train/validation partition is produced each run.
    np.random.seed(random_seed)
    np.random.shuffle(indices)
# The first `split` indices are held out for validation; the rest train.
train_indices, val_indices = indices[split:], indices[:split]

# Creating PT data samplers and loaders:
# Both loaders wrap the same dataset; each sampler restricts it to its own
# index list.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                           sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                sampler=valid_sampler)

# Usage Example:
num_epochs = 10
for epoch in range(num_epochs):
    # Train:
    for batch_index, (faces, labels) in enumerate(train_loader):
        # ... (training step goes here)
        pass
dataset_size
)。
forward
会获取输入数据。该数据的形状是 5D 张量 - (32L, 35887L, 48L, 48L, 1L)
。 32 是批量大小,接下来是数据集的长度,然后是图像的高度、宽度和通道。
Dataset.__getitem__()
应返回单个样本和标签,而不是整个数据集。我编辑了我的帖子,给你一个例子,它应该是什么样子。
batch_size
定义堆叠在一起的样本数量,并在每次训练迭代中传递给神经网络的 mini-batch。有关详细信息,请参阅 Dataloader documentation 或此 Cross-Validated thread。
当前答案进行随机拆分,其缺点是不能保证每类的样本数量是平衡的。当您希望每个类只有少量样本时,这尤其成问题。例如,MNIST 有 60,000 个示例,即每个数字约 6000 个。假设您只需要训练集中每个数字 30 个示例。在这种情况下,随机拆分可能会在类之间产生不平衡(某个数字的训练数据比其他数字多)。因此,您要确保每个数字恰好只有 30 个带标签的样本。这称为分层抽样。
一种方法是在 Pytorch 和 sample code is here 中使用采样器接口。
另一种方法是自己动手实现 :)。例如,下面是针对 MNIST 的简单实现,其中 ds
是 MNIST 数据集,k
是每个类所需的样本数。
def sampleFromClass(ds, k):
    """Split ``ds`` into a per-class-capped training set and a remainder set.

    ``ds`` is iterated once, in order. The first ``k`` samples seen for each
    class go to the training split; every later sample of that class goes to
    the test split.

    Args:
        ds: Iterable of ``(data, label)`` pairs. ``data`` must be a tensor;
            ``label`` may be a scalar tensor or a plain Python number.
        k: Maximum number of samples per class in the training split.

    Returns:
        A ``(train, test)`` tuple of ``TensorDataset`` objects. Data tensors
        and labels are each joined with ``torch.cat`` along dimension 0.

    Note:
        ``torch.cat`` is called on both splits, so each split must receive at
        least one sample.
    """
    class_counts = {}
    train_data, train_label = [], []
    test_data, test_label = [], []
    for data, label in ds:
        # Labels may arrive as 0-d tensors or as plain ints; plain ints have
        # no .item(), so only unwrap tensors.
        c = label.item() if isinstance(label, torch.Tensor) else label
        class_counts[c] = class_counts.get(c, 0) + 1
        # Give every label a leading dimension so torch.cat can join them.
        label = torch.unsqueeze(torch.as_tensor(label), 0)
        if class_counts[c] <= k:
            train_data.append(data)
            train_label.append(label)
        else:
            test_data.append(data)
            test_label.append(label)
    train_data = torch.cat(train_data)
    train_label = torch.cat(train_label)
    test_data = torch.cat(test_data)
    test_label = torch.cat(test_label)
    return (TensorDataset(train_data, train_label),
            TensorDataset(test_data, test_label))
你可以像这样使用这个函数:
def main():
    """Load the MNIST training set and split it with ``sampleFromClass``.

    The dataset is stored under ``../data`` (downloaded on request) and each
    image is converted with ``ToTensor``. At most 3 samples per class are
    requested for the training split.
    """
    to_tensor = transforms.Compose([transforms.ToTensor()])
    mnist_train = datasets.MNIST('../data', train=True, download=True,
                                 transform=to_tensor)
    train_ds, test_ds = sampleFromClass(mnist_train, 3)
如果您想确保您的分组具有平衡的类,您可以使用 sklearn
中的 train_test_split
。
假设您已将 data
包装在 custom Dataset object 中:
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split
TEST_SIZE = 0.1
BATCH_SIZE = 64
SEED = 42

# Split sample positions rather than the samples themselves: the integers
# 0..len(data)-1 are handed to train_test_split, stratified on data.targets.
# The two label outputs of the split are not needed and are discarded.
sample_positions = range(len(data))
train_indices, test_indices, _, _ = train_test_split(
    sample_positions,
    data.targets,
    stratify=data.targets,
    test_size=TEST_SIZE,
    random_state=SEED,
)

# Wrap the original dataset in index-restricted views.
train_split = Subset(data, train_indices)
test_split = Subset(data, test_indices)

# Batch both views; only the training batches are shuffled.
train_batches = DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
test_batches = DataLoader(test_split, batch_size=BATCH_SIZE)
这是带有 random_split
方法的 PyTorch Subset
类。请注意,此方法是 SubsetRandomSampler
的基础。
https://i.stack.imgur.com/K9D0z.png
对于 MNIST,如果我们使用 random_split
:
# MNIST training set: images converted to tensors, then normalised with
# mean 0.5 and std 0.5.
mnist_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.5,), (0.5,)),
])
mnist_train = torchvision.datasets.MNIST(
    '/data/mnist', train=True, download=True, transform=mnist_transform)
loader = DataLoader(mnist_train, batch_size=16, shuffle=False)

print(loader.dataset.data.shape)

# Randomly partition the loader's dataset into 50000 and 10000 samples.
test_ds, valid_ds = torch.utils.data.random_split(loader.dataset, (50000, 10000))

# Inspect the two Subset objects and the indices each one was given.
print(test_ds, valid_ds)
print(test_ds.indices, valid_ds.indices)
print(test_ds.indices.shape, valid_ds.indices.shape)
我们得到:
torch.Size([60000, 28, 28])
<torch.utils.data.dataset.Subset object at 0x0000020FD1880B00> <torch.utils.data.dataset.Subset object at 0x0000020FD1880C50>
tensor([ 1520, 4155, 45472, ..., 37969, 45782, 34080]) tensor([ 9133, 51600, 22067, ..., 3950, 37306, 31400])
torch.Size([50000]) torch.Size([10000])
我们的 test_ds.indices
和 valid_ds.indices
将在范围 (0, 59999)
中随机出现。但是,如果我想从 (0, 49999)
和 (50000, 59999)
获取索引序列,我目前无法做到这一点,除了 this 方式。
在您运行 the MNIST benchmark 的情况下很方便,其中预定义了测试数据集和验证数据集。
请记住,大多数典型的示例数据集已经预先拆分好了。例如,在 this page,您会找到 MNIST。一种普遍的看法是它有 60.000 张图像。错!它一共有 70.000 张图像,其中 60.000 张是训练图像,10.000 张是验证(测试)图像。
因此,对于规范数据集,PyTorch 的风格是为您提供已经拆分好的数据集。
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.optim import *
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import os
import numpy as np
import random
# Batch size shared by both loaders.
bs = 512

# Convert images to tensors, then normalise with mean 0 and std 1.
t = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0), std=(1)),
])

# MNIST ships with a predefined split: train=True selects the training
# images, train=False the held-out ones.
mnist_train = torchvision.datasets.MNIST(
    '/data/mnist', download=True, train=True, transform=t)
mnist_valid = torchvision.datasets.MNIST(
    '/data/mnist', download=True, train=False, transform=t)

# NOTE: drop_last=True discards the final incomplete batch in both loaders.
dl_train = DataLoader(mnist_train, batch_size=bs, drop_last=True, shuffle=True)
dl_valid = DataLoader(mnist_valid, batch_size=bs, drop_last=True, shuffle=True)
如果您希望训练数据集中每个类别最多 X 个样本,您可以使用以下代码:
def stratify_split(dataset: Dataset, train_samples_per_class: int):
    """Split ``dataset`` so the training part has a capped count per class.

    ``dataset`` is enumerated once, in order. The first
    ``train_samples_per_class`` samples seen for each target go to the
    training subset; every later sample of that target goes to the
    validation subset.

    Args:
        dataset: Iterable/indexable dataset whose samples support
            ``sample['target']``.
        train_samples_per_class: Maximum number of samples per target value
            in the training subset.

    Returns:
        A ``(train_dataset, val_dataset)`` tuple of ``Subset`` views over
        ``dataset``.
    """
    import collections

    train_indices = []
    val_indices = []
    target_counter = collections.Counter()
    for idx, data in enumerate(dataset):
        target = data['target']
        # Tensors hash by identity, so equal-valued tensor targets would each
        # be counted as a separate class; unwrap scalars to plain numbers.
        if hasattr(target, 'item'):
            target = target.item()
        target_counter[target] += 1
        if target_counter[target] <= train_samples_per_class:
            train_indices.append(idx)
        else:
            val_indices.append(idx)
    train_dataset = Subset(dataset, train_indices)
    val_dataset = Subset(dataset, val_indices)
    return train_dataset, val_dataset
不定期副业成功案例分享
train_loader
stackoverflow.com/questions/53916594/… 时遇到了这个问题AttributeError: 'Subset' object has no attribute 'targets'
如何仅访问其中一个子集的目标?我想分别为训练和测试数据打印这样的内容{0: 111, 1: 722, 2: 813, 3: 175, 4: 283, 5: 2846, 6: 290, 7: 106}
TypeError 'DataLoader' object is not subscriptable
,您可能还想查看 stackoverflow.com/a/60150673/12068941