Hugging Face: Unable to fine-tune Whisper using Seq2SeqTrainer on Kaggle

Asked by: Xanta_Kross · Asked: 10/24/2023 · Updated: 10/24/2023 · Views: 48

Q:

I copied this Kaggle notebook: https://www.kaggle.com/code/imtiazprio/fast-whisper-large-v2-fine-tuning-with-lora

Its saved output shows that it ran successfully. But now, when I run the same notebook after copying it to my account, it shows me this error.
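For context, the notebook's title and the traceback below imply a setup along these lines: Whisper large-v2 loaded in 8-bit via bitsandbytes, LoRA adapters from peft, fp16 mixed precision and gradient checkpointing, trained with Seq2SeqTrainer. The following is only a minimal sketch; the model name, LoRA targets, and hyperparameters are illustrative assumptions rather than values copied from the notebook, and dataset preparation is omitted.

import torch
from transformers import (
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import LoraConfig, get_peft_model

# 8-bit base model (this is what routes the backward pass through
# bitsandbytes' MatMul8bitLt, seen in the traceback below).
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-large-v2",
    load_in_8bit=True,
    device_map="auto",
)

# LoRA adapters on the attention projections (a typical choice for Whisper;
# the actual target modules are an assumption here).
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, lora_config)

training_args = Seq2SeqTrainingArguments(
    output_dir="whisper-large-v2-lora",
    per_device_train_batch_size=8,
    learning_rate=1e-3,
    fp16=True,                      # GradScaler path seen in the traceback
    gradient_checkpointing=True,    # checkpoint.py frames in the traceback
    max_steps=100,
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # train_dataset=..., data_collator=...  (prepared elsewhere in the notebook)
)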

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[16], line 2
      1 # with torch.autocast("cuda"):
----> 2 trainer.train()
      3 trainer.save_model()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1645, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1640     self.model_wrapped = self.model
   1642 inner_training_loop = find_executable_batch_size(
   1643     self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
   1644 )
-> 1645 return inner_training_loop(
   1646     args=args,
   1647     resume_from_checkpoint=resume_from_checkpoint,
   1648     trial=trial,
   1649     ignore_keys_for_eval=ignore_keys_for_eval,
   1650 )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1938, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1935     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   1937 with self.accelerator.accumulate(model):
-> 1938     tr_loss_step = self.training_step(model, inputs)
   1940 if (
   1941     args.logging_nan_inf_filter
   1942     and not is_torch_tpu_available()
   1943     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   1944 ):
   1945     # if loss is nan or inf simply add the average of previous logged losses
   1946     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2770, in Trainer.training_step(self, model, inputs)
   2768         scaled_loss.backward()
   2769 else:
-> 2770     self.accelerator.backward(loss)
   2772 return loss.detach() / self.args.gradient_accumulation_steps

File /opt/conda/lib/python3.10/site-packages/accelerate/accelerator.py:1984, in Accelerator.backward(self, loss, **kwargs)
   1982     return
   1983 elif self.scaler is not None:
-> 1984     self.scaler.scale(loss).backward(**kwargs)
   1985 else:
   1986     loss.backward(**kwargs)

File /opt/conda/lib/python3.10/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    477 if has_torch_function_unary(self):
    478     return handle_torch_function(
    479         Tensor.backward,
    480         (self,),
   (...)
    485         inputs=inputs,
    486     )
--> 487 torch.autograd.backward(
    488     self, gradient, retain_graph, create_graph, inputs=inputs
    489 )

File /opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:274, in BackwardCFunction.apply(self, *args)
    270     raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
    271                        "Function is not allowed. You should only implement one "
    272                        "of them.")
    273 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 274 return user_fn(self, *args)

File /opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:157, in CheckpointFunction.backward(ctx, *args)
    153 if len(outputs_with_grad) == 0:
    154     raise RuntimeError(
    155         "none of output has requires_grad=True,"
    156         " this checkpoint() is not necessary")
--> 157 torch.autograd.backward(outputs_with_grad, args_with_grad)
    158 grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None
    159               for inp in detached_inputs)
    161 return (None, None) + grads

File /opt/conda/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    195     retain_graph = create_graph
    197 # The reason we repeat same the comment below is that
    198 # some Python versions print out the first line of a multi-line function
    199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    201     tensors, grad_tensors_, retain_graph, create_graph, inputs,
    202     allow_unreachable=True, accumulate_grad=True)

File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:274, in BackwardCFunction.apply(self, *args)
    270     raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
    271                        "Function is not allowed. You should only implement one "
    272                        "of them.")
    273 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 274 return user_fn(self, *args)

File /opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:480, in MatMul8bitLt.backward(ctx, grad_output)
    478 elif state.CB is not None:
    479     CB = state.CB.to(ctx.dtype_A, copy=True).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
--> 480     grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
    481 elif state.CxB is not None:
    482     CB = (
    483         undo_layout(state.CxB, state.tile_indices)
    484         .to(ctx.dtype_A)
    485         .mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
    486     )

RuntimeError: expected scalar type Half but found Float

Please provide me with a solution, or point me to somewhere other than GitHub where I can report this so it gets fixed.
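For reference, the RuntimeError is raised inside bitsandbytes' MatMul8bitLt.backward, which receives a float32 grad_output while the 8-bit layer expects float16. Two workarounds that are commonly suggested for this kind of fp16 + 8-bit + LoRA dtype mismatch are sketched below; both are assumptions about what may help here, not a confirmed fix for this notebook.

from peft import prepare_model_for_kbit_training, get_peft_model
import torch

# Option 1: prepare the 8-bit model before attaching the LoRA adapters, so
# that norm/output layers are upcast to float32 and gradient checkpointing
# is enabled in a way that keeps dtypes consistent during backward.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

# Option 2: run training under an autocast context, as the commented-out
# line in Cell 16 ('# with torch.autocast("cuda"):') hints.
with torch.autocast("cuda"):
    trainer.train()
trainer.save_model()

If neither helps, pinning transformers, peft, accelerate, and bitsandbytes to the versions recorded in the original notebook's saved output is another common suggestion, since the working run presumably used older releases.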

Kaggle OpenAI-Whisper HuggingFace-Trainer

Comments


A: No answers yet