
Training pytorch transformer on kaggle gives RuntimeError: CUDA error: device-side assert triggered

Asked by: Abdalnassir Ghzawi · Asked: 11/9/2023 · Modified: 11/9/2023 · Viewed: 11 times

Q:

I've been searching for a solution for two days with no luck. I'm trying to train a transformer to translate English into Arabic. It works fine when I use a smaller dataset (around 30k sentence pairs), but the problem shows up consistently once I use 60k+ sentence pairs, and I don't understand why this happens!!

I'm training this transformer on Kaggle.
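A device-side assert is raised asynchronously, so the traceback below ends up blaming whatever CUDA call happens to synchronize next (here the model's .to(device)) rather than the kernel that actually failed. A minimal sketch of how this is usually localized, assuming the common cause of an out-of-range index into an embedding; the variable names mirror the ones defined further down in the post:

import os
# must be set before CUDA is initialized so kernel launches become synchronous
# and the assert surfaces at its real call site
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# sanity-check every batch: token ids must stay below the vocab sizes and
# sequences must not exceed max_len, or an embedding lookup will trip the assert
for batch in train_iterator:
    src, trg = batch.eng, batch.ar  # shapes: (seq_len, batch_size)
    assert src.max().item() < src_vocab_size, "source token id out of range"
    assert trg.max().item() < trg_vocab_size, "target token id out of range"
    assert src.size(0) <= max_len, "source sentence longer than max_len"
    assert trg.size(0) <= max_len, "target sentence longer than max_len"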

"""
torch modules
plus, of course, numpy and pandas
"""

import pandas as pd
import numpy as np

import torch
from torch import nn
from torch.nn import functional as F
import torch.optim as optim
from torchtext import data

# run on the GPU if one is available
if torch.cuda.is_available():
    dev = "cuda:0"
    print("gpu up")
else:
    dev = "cpu"
device = torch.device(dev)

import random
SEED = 32

"""
regex and the tokenizers
"""

import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.ar import Arabic
from nltk.translate.bleu_score import sentence_bleu

enNLP = English()
arNLP = Arabic()

enTokenizer = Tokenizer(enNLP.vocab)
arTokenizer = Tokenizer(arNLP.vocab)

df = pd.read_csv("/kaggle/input/translation-with-transformers/opus-ted.txt", encoding="utf-8", delimiter="\t\t", names=["eng", "ar"])

"""
defining the tokenizers for arabic and english

creating the fields for the dataset from torchtext;
that class is the simplest way I could find to turn a df into a torch dataset

ببدأ and نهها are just arbitrary words for the start- and end-of-sentence tokens;
for some reason, when I choose an arabic word for the unknown token, the vocab
doesn't replace words that are not in the vocab
"""

def myTokenizerEN(x):
    # lowercase, strip punctuation and newlines, then collapse repeated whitespace
    return [word.text for word in
            enTokenizer(re.sub(r"\s+\s+", " ", re.sub(r"[\.\'\`\"\r+\n+]", " ", x.lower())).strip())]

def myTokenizerAR(x):
    return [word.text for word in
            arTokenizer(re.sub(r"\s+\s+", " ", re.sub(r"[\.\'\`\"\r+\n+]", " ", x.lower())).strip())]

SRC = data.Field(tokenize=myTokenizerEN, batch_first=False, init_token="<sos>", eos_token="<eos>")
TARGET = data.Field(tokenize=myTokenizerAR, batch_first=False, tokenizer_language="ar", init_token="ببدأ", eos_token="نهها")

class DataFrameDataset(data.Dataset):
    """wraps a pandas dataframe of (eng, ar) pairs as a torchtext Dataset"""

    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [('eng', src_field), ('ar', target_field)]
        examples = []
        for i, row in df.iterrows():
            examples.append(data.Example.fromlist([row.eng, row.ar], fields))

        super().__init__(examples, fields, **kwargs)


torchdataset = DataFrameDataset(df, SRC, TARGET)


train_data, valid_data = torchdataset.split(split_ratio=0.8, random_state=random.seed(SEED))

SRC.build_vocab(train_data, min_freq=2)
TARGET.build_vocab(train_data, min_freq=2)


"""
we batch the validation and test sets too: because of memory usage we
can't pass the whole set at once.

try lowering the batch size if you run out of memory
"""
BATCH_SIZE = 64

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort=False,
    sort_within_batch=False,
    shuffle=True)

# no. of unique tokens in the source (english) vocab
src_vocab_size = len(SRC.vocab)
print("Size of english vocabulary:", src_vocab_size)

# no. of unique tokens in the target (arabic) vocab
trg_vocab_size = len(TARGET.vocab)
print("Size of arabic vocabulary:", trg_vocab_size)

num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3

max_len = 227
embedding_size = 256
src_pad_idx = SRC.vocab.stoi["<pad>"]
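
# NOTE: TranslateTransformer is never defined in the post. The class below is
# a plausible minimal reconstruction built on torch.nn.Transformer, added only
# so the snippet runs end to end; every design choice in it is an assumption,
# not the asker's actual model.
class TranslateTransformer(nn.Module):
    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size,
                 src_pad_idx, num_heads, num_encoder_layers,
                 num_decoder_layers, max_len):
        super().__init__()
        self.src_pad_idx = src_pad_idx
        # token + learned position embeddings; a position index or token id
        # >= the embedding's size trips exactly the kind of device-side
        # assert described in this question
        self.src_tok_emb = nn.Embedding(src_vocab_size, embedding_size)
        self.trg_tok_emb = nn.Embedding(trg_vocab_size, embedding_size)
        self.src_pos_emb = nn.Embedding(max_len, embedding_size)
        self.trg_pos_emb = nn.Embedding(max_len, embedding_size)
        self.transformer = nn.Transformer(
            d_model=embedding_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)

    def forward(self, src, trg):
        # src: (src_len, N), trg: (trg_len, N) -- batch_first=False throughout
        src_len, N = src.shape
        trg_len, _ = trg.shape
        src_pos = torch.arange(src_len, device=src.device).unsqueeze(1).expand(src_len, N)
        trg_pos = torch.arange(trg_len, device=trg.device).unsqueeze(1).expand(trg_len, N)
        src_emb = self.src_tok_emb(src) + self.src_pos_emb(src_pos)
        trg_emb = self.trg_tok_emb(trg) + self.trg_pos_emb(trg_pos)
        # ignore pad positions in the encoder; mask future positions in the decoder
        src_key_padding_mask = (src == self.src_pad_idx).transpose(0, 1)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_len).to(src.device)
        out = self.transformer(
            src_emb, trg_emb,
            src_key_padding_mask=src_key_padding_mask,
            tgt_mask=trg_mask,
        )
        return self.fc_out(out)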


model = TranslateTransformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    max_len
).to(device)

loss_track = []
loss_validation_track= []

"""
I'm using adagrad because it assigns bigger updates to less
frequently updated weights, so I thought it could be useful for
words that don't appear often.
"""

optimizer = optim.Adagrad(model.parameters(), lr=0.003)
EPOCHS = 15

pad_idx = TARGET.vocab.stoi["<pad>"]  # pad index of the *target* vocab, since the loss is over target tokens
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

for i in range(EPOCHS):
    stepLoss = []
    model.train()  # training mode (applies dropout and batchnorm)
    for batch in train_iterator:
        input_sentence = batch.eng.to(device)
        trg = batch.ar.to(device)

        optimizer.zero_grad()
        # feed the target shifted right, score against the target shifted left
        out = model(input_sentence, trg[:-1])
        out = out.reshape(-1, trg_vocab_size)
        trg = trg[1:].reshape(-1)
        loss = criterion(out, trg)

        loss.backward()
        optimizer.step()
        stepLoss.append(loss.item())

    loss_track.append(np.mean(stepLoss))
    print("train crossentropy at epoch {} loss: ".format(i), np.mean(stepLoss))
    
    stepValidLoss = []
    model.eval()  # evaluation mode (disables dropout and batchnorm updates)
    with torch.no_grad():  # no gradients needed during validation
        for batch in valid_iterator:
            input_sentence = batch.eng.to(device)
            trg = batch.ar.to(device)

            out = model(input_sentence, trg[:-1])
            out = out.reshape(-1, trg_vocab_size)
            trg = trg[1:].reshape(-1)
            loss = criterion(out, trg)

            stepValidLoss.append(loss.item())

    loss_validation_track.append(np.mean(stepValidLoss))
    print("validation crossentropy at epoch {} loss: ".format(i), np.mean(stepValidLoss))

I get this error:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-30-c9f694cb9d66> in <module>
     25     num_decoder_layers,
     26     max_len
---> 27 ).to(device)

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
    441             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    442 
--> 443         return self._apply(convert)
    444 
    445     def register_backward_hook(self, hook):

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    201     def _apply(self, fn):
    202         for module in self.children():
--> 203             module._apply(fn)
    204 
    205         def compute_should_use_set_data(tensor, tensor_applied):

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    223                 # `with torch.no_grad():`
    224                 with torch.no_grad():
--> 225                     param_applied = fn(param)
    226                 should_use_set_data = compute_should_use_set_data(param, param_applied)
    227                 if should_use_set_data:

/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in convert(t)
    439             if convert_to_format is not None and t.dim() == 4:
    440                 return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
--> 441             return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
    442 
    443         return self._apply(convert)

RuntimeError: CUDA error: device-side assert triggered
pytorch · kaggle



A: No answers yet