提问人:skidjoe 提问时间:10/12/2021 更新时间:10/4/2023 访问量:2753
错误:尝试在自定义 HF 数据集上使用 trainer.train() 时,vars() 参数必须具有__dict__属性?
ERROR: vars() argument must have __dict__ attribute when trying to use trainer.train() on custom HF dataset?
问:
我有以下模型,我正在尝试微调(CLIP_ViT + 分类头)。这是我的模型定义:
class CLIPNN(nn.Module):
    """CLIP vision backbone plus a small MLP classification head.

    The pre-trained vision model and processor are loaded and stored on the
    module, but ``forward`` does not call them: it feeds its input straight
    into the classification head, i.e. it expects already-computed embeddings.

    Args:
        num_labels: Number of output classes of the classification head.
        pretrained_name: Checkpoint name passed to ``from_pretrained`` for
            both the vision model and the processor.
        dropout: Dropout probability used inside the classification head.
    """

    def __init__(self, num_labels, pretrained_name="openai/clip-vit-base-patch32", dropout=0.1):
        super().__init__()
        self.num_labels = num_labels
        # load pre-trained transformer & processor
        self.transformer = CLIPVisionModel.from_pretrained(pretrained_name)
        self.processor = CLIPProcessor.from_pretrained(pretrained_name)
        # initialize other layers (head after the transformer body)
        # The head's first layer has in_features=512, so the last dimension of
        # the tensor given to forward() must be 512.
        self.classifier = nn.Sequential(
            nn.Linear(512, 128, bias=True),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout, inplace=False),
            nn.Linear(128, self.num_labels, bias=True),
        )

    def forward(self, inputs=None, labels=None, embeddings=None, **kwargs):
        """Classify embeddings and optionally compute the cross-entropy loss.

        Args:
            inputs: Tensor fed to the classification head.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.
            embeddings: Alias for ``inputs``. A batch whose tensor is stored
                under the key ``"embeddings"`` can then be passed as
                ``model(**batch)``; it is used only when ``inputs`` is None.
            **kwargs: Ignored; accepted so extra batch keys do not raise.

        Returns:
            SequenceClassifierOutput holding ``loss`` (None when no labels
            were given) and ``logits``.

        Raises:
            ValueError: If neither ``inputs`` nor ``embeddings`` is provided.
        """
        if inputs is None:
            inputs = embeddings
        if inputs is None:
            raise ValueError("forward() needs either `inputs` or `embeddings`")

        logits = self.classifier(inputs)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            # Flatten so logits are (N, num_labels) and labels are (N,).
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
        )
我还有以下数据集定义:
import torch


class CLIPDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing precomputed embeddings with class labels.

    ``Dataset`` and the tensor constructors live in the ``torch`` namespace
    (``torch.utils.data.Dataset``, ``torch.tensor``), not under ``torch.nn``.

    Args:
        embeddings: Indexable collection; ``embeddings[idx]`` is the feature
            vector of sample ``idx``.
        labels: Indexable collection of class labels, one per sample; its
            length defines the dataset length.
    """

    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Returns:
            dict with ``"embeddings"`` (float32 tensor) and ``"labels"``
            (int64 tensor of shape ``(1,)``).
        """
        item = {"embeddings": torch.as_tensor(self.embeddings[idx], dtype=torch.float32)}
        # The label is wrapped in a list so it becomes a 1-element tensor.
        item['labels'] = torch.tensor([self.labels[idx]], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)
注意:这里我假设模型接收的是预先计算好的嵌入,而不是在模型内部计算嵌入。我知道如果要微调 CLIP 基础模型,这并不是正确的逻辑;我只是想先让代码跑起来。
像这样的东西会抛出一个错误:
# Wrap the raw splits in CLIPDataset; the names train_data / test_data are
# rebound from the raw data to the dataset objects.
train_data = CLIPDataset(train_data, y_train)
test_data = CLIPDataset(test_data, y_test)

# Two-class classifier.
model = CLIPNN(num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Start training; this is the call that raises the TypeError in the question.
trainer.train()
TypeError Traceback (most recent call last) in ----> 1 trainer.train()
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs) 1256 self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) 1257 → 1258 for step, inputs in enumerate(epoch_iterator): 1259 1260 # Skip past any already trained steps if resuming training
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self) 515 if self._sampler_iter is None: 516 self._reset() → 517 data = self._next_data() 518 self._num_yielded += 1 519 if self._dataset_kind == _DatasetKind.Iterable and \
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self) 555 def _next_data(self): 556 index = self._next_index() # may raise StopIteration → 557 data = self._dataset_fetcher.fetch(index) # may raise StopIteration 558 if self._pin_memory: 559 data = _utils.pin_memory.pin_memory(data)
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index) 45 else: 46 data = self.dataset[possibly_batched_index] —> 47 return self.collate_fn(data)
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/transformers/data/data_collator.py in default_data_collator(features, return_tensors) 64 65 if return_tensors == "pt": —> 66 return torch_default_data_collator(features) 67 elif return_tensors == "tf": 68 return tf_default_data_collator(features)
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/transformers/data/data_collator.py in torch_default_data_collator(features) 80 81 if not isinstance(features[0], (dict, BatchEncoding)): —> 82 features = [vars(f) for f in features] 83 first = features[0] 84 batch = {}
~/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages/transformers/data/data_collator.py in <listcomp>(.0) 80 81 if not isinstance(features[0], (dict, BatchEncoding)): —> 82 features = [vars(f) for f in features] 83 first = features[0] 84 batch = {}
TypeError: vars() argument must have __dict__ attribute
知道我做错了什么吗?
答:
您需要将 label_names 属性添加到您的 Trainer 中
# NOTE(review): `label_names` is documented as a TrainingArguments field;
# confirm that the installed Trainer accepts it as a constructor keyword,
# otherwise set it on `training_args` instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    label_names=['labels'],
    eval_dataset=test_data,
)
评论
Trainer