Asked by: Abdalnassir Ghzawi Asked: 11/9/2023 Updated: 11/9/2023 Views: 11
Training pytorch transformer on kaggle gives RuntimeError: CUDA error: device-side assert triggered
Q:
I've been looking for a solution for two days with no luck. I'm trying to train a transformer to translate English to Arabic. It works fine when I use a smaller dataset (around 30k sentence pairs), but with 60k+ sentence pairs the error happens every time, and I can't understand why!!
I'm using Kaggle to train this transformer.
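Since device-side asserts are raised asynchronously, the traceback below points at the `.to(device)` call rather than the kernel that actually failed. A minimal debugging sketch (my addition, assuming a standard Kaggle PyTorch environment) that makes the real failing line visible:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before the first CUDA call
# alternatively, rerun on the CPU: the same bug then surfaces as a readable
# Python exception (often an IndexError from an embedding lookup)
# device = torch.device("cpu")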
"""
torch modules
with ofc numpy and pandas
"""
import pandas as pd
import numpy as np
from torch import nn
import torch
from torchtext import data
from torch.nn import functional as F
import torch.optim as optim
if torch.cuda.is_available():
    dev = "cuda:0"
    print("gpu up")
else:
    dev = "cpu"
device = torch.device(dev)
import random
SEED= 32
"""
regex and the tokenizers
"""
import re
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.ar import Arabic
from nltk.translate.bleu_score import sentence_bleu
enNLP = English()
arNLP = Arabic()
enTokenizer = Tokenizer(enNLP.vocab)
arTokenizer = Tokenizer(arNLP.vocab)
df = pd.read_csv("/kaggle/input/translation-with-transformers/opus-ted.txt",encoding="utf-8",delimiter="\t\t",names=["eng","ar"])
"""
defining the tokenizers for arabic and english
creating the fields for the dataset from torchtext
that class is the simple way I could find for turning a df into a torch dataset
نهها and ببدأ are just arbitrary words for init and end of sentence tokens
for some reason when I choose an arabic word for the unknown token the vocab doesn't replace words that are not in the vocab
"""
def myTokenizerEN(x):
    return [word.text for word in
            enTokenizer(re.sub(r"\s+\s+"," ",re.sub(r"[\.\'\`\"\r+\n+]"," ",x.lower())).strip())]

def myTokenizerAR(x):
    return [word.text for word in
            arTokenizer(re.sub(r"\s+\s+"," ",re.sub(r"[\.\'\`\"\r+\n+]"," ",x.lower())).strip())]
SRC = data.Field(tokenize=myTokenizerEN,batch_first=False,init_token="<sos>",eos_token="<eos>")
TARGET = data.Field(tokenize=myTokenizerAR,batch_first=False,tokenizer_language="ar",init_token="ببدأ",eos_token="نهها")
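# quick sanity check of the cleaning + tokenization (hypothetical input, my
# addition): the regexes strip periods/quotes and collapse whitespace, and the
# bare spacy Tokenizer then splits on whitespace, so e.g.
#   myTokenizerEN("It's a test.")  ->  ['it', 's', 'a', 'test']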
class DataFrameDataset(data.Dataset):
    def __init__(self, df, src_field, target_field, is_test=False, **kwargs):
        fields = [('eng', src_field), ('ar', target_field)]
        examples = []
        for i, row in df.iterrows():
            eng = row.eng
            ar = row.ar
            examples.append(data.Example.fromlist([eng, ar], fields))
        super().__init__(examples, fields, **kwargs)
torchdataset = DataFrameDataset(df,SRC,TARGET)
train_data, valid_data = torchdataset.split(split_ratio=0.8, random_state = random.seed(SEED))
SRC.build_vocab(train_data,min_freq=2)
TARGET.build_vocab(train_data,min_freq=2)
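# sanity check (my addition): legacy torchtext puts the special tokens first,
# and any out-of-vocab word should map to the <unk> index
# print(SRC.vocab.itos[:4])     # expected: ['<unk>', '<pad>', '<sos>', '<eos>']
# print(TARGET.vocab.itos[:4])  # expected: ['<unk>', '<pad>', 'ببدأ', 'نهها']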
"""
we are using batches for validation and test set because of memory usage we can't pass the whole set at once
try lowering the batch size if you are out of memory
"""
BATCH_SIZE = 64
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
    sort_within_batch=False,
    shuffle=True)
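# shape note (my addition): with batch_first=False the iterators yield
# sequence-major tensors, i.e. batch.eng is (src_len, BATCH_SIZE) and
# batch.ar is (trg_len, BATCH_SIZE), each padded to the longest sentence
# in its batch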
#No. of unique tokens in text
src_vocab_size = len(SRC.vocab)
print("Size of english vocabulary:",src_vocab_size)
#No. of unique tokens in label
trg_vocab_size =len(TARGET.vocab)
print("Size of arabic vocabulary:",trg_vocab_size)
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
max_len = 227
embedding_size = 256
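# diagnostic sketch (my addition, a guess at the root cause): the usual reason
# this assert appears only on the bigger corpus is a sentence longer than
# max_len, which makes the positional-embedding lookup index out of range.
# Each example also gains an init and an eos token, so tokenized lengths must
# stay <= max_len - 2:
longest_eng = max(len(ex.eng) for ex in train_data)
longest_ar = max(len(ex.ar) for ex in train_data)
print(longest_eng, longest_ar)
assert max(longest_eng, longest_ar) <= max_len - 2, "increase max_len or filter long pairs"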
src_pad_idx =SRC.vocab.stoi["<pad>"]
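# the post doesn't show TranslateTransformer; below is a minimal sketch of the
# usual nn.Transformer recipe this training loop implies (my assumption, not
# the author's actual class). Note the position embeddings of size max_len:
# indexing them with a position >= max_len is exactly what fires the assert.
class TranslateTransformer(nn.Module):
    def __init__(self, embedding_size, src_vocab_size, trg_vocab_size,
                 src_pad_idx, num_heads, num_encoder_layers,
                 num_decoder_layers, max_len):
        super().__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.transformer = nn.Transformer(embedding_size, num_heads,
                                          num_encoder_layers, num_decoder_layers)
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.src_pad_idx = src_pad_idx

    def make_src_key_padding_mask(self, src):
        # src: (src_len, N) -> mask: (N, src_len), True where padded
        return (src == self.src_pad_idx).transpose(0, 1)

    def forward(self, src, trg):
        src_len, N = src.shape
        trg_len, N = trg.shape
        # positions 0..len-1 for every sequence in the batch
        src_pos = torch.arange(src_len, device=src.device).unsqueeze(1).expand(src_len, N)
        trg_pos = torch.arange(trg_len, device=trg.device).unsqueeze(1).expand(trg_len, N)
        embed_src = self.src_word_embedding(src) + self.src_position_embedding(src_pos)
        embed_trg = self.trg_word_embedding(trg) + self.trg_position_embedding(trg_pos)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_len).to(trg.device)
        out = self.transformer(embed_src, embed_trg,
                               src_key_padding_mask=self.make_src_key_padding_mask(src),
                               tgt_mask=trg_mask)
        return self.fc_out(out)  # (trg_len, N, trg_vocab_size)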
model = TranslateTransformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    max_len
).to(device)
loss_track = []
loss_validation_track= []
"""
I'm using adagrad because it assigns bigger updates to less
frequently updated weights so thought it could be useful for
words not used a lot.
"""
optimizer = optim.Adagrad(model.parameters(),lr = 0.003)
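# why Adagrad helps rare words (my gloss of the reasoning above): it keeps a
# per-parameter running sum of squared gradients G and scales each step by
# lr / (sqrt(G) + eps), so rarely updated embedding rows retain a comparatively
# large effective learning rate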
EPOCHS = 15
pad_idx = SRC.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
for i in range(0, EPOCHS):
    stepLoss = []
    model.train()  # training mode (enables dropout and batchnorm updates)
    for batch in train_iterator:
        input_sentence = batch.eng.to(device)
        trg = batch.ar.to(device)
        optimizer.zero_grad()
        out = model(input_sentence, trg[:-1])  # feed target shifted right (drop last token)
        out = out.reshape(-1, trg_vocab_size)
        trg = trg[1:].reshape(-1)              # predict the next token (drop the init token)
        loss = criterion(out, trg)
        loss.backward()
        optimizer.step()
        stepLoss.append(loss.item())
    loss_track.append(np.mean(stepLoss))
    print("train crossentropy at epoch {} loss: ".format(i), np.mean(stepLoss))
    stepValidLoss = []
    model.eval()  # evaluation mode (disables dropout and batchnorm updates)
    with torch.no_grad():  # no gradients needed for validation
        for batch in valid_iterator:
            input_sentence = batch.eng.to(device)
            trg = batch.ar.to(device)
            out = model(input_sentence, trg[:-1])
            out = out.reshape(-1, trg_vocab_size)
            trg = trg[1:].reshape(-1)
            loss = criterion(out, trg)
            stepValidLoss.append(loss.item())
    loss_validation_track.append(np.mean(stepValidLoss))
    print("validation crossentropy at epoch {} loss: ".format(i), np.mean(stepValidLoss))
I get this error:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-30-c9f694cb9d66> in <module>
25 num_decoder_layers,
26 max_len
---> 27 ).to(device)
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in to(self, *args, **kwargs)
441 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
442
--> 443 return self._apply(convert)
444
445 def register_backward_hook(self, hook):
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
201 def _apply(self, fn):
202 for module in self.children():
--> 203 module._apply(fn)
204
205 def compute_should_use_set_data(tensor, tensor_applied):
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in _apply(self, fn)
223 # `with torch.no_grad():`
224 with torch.no_grad():
--> 225 param_applied = fn(param)
226 should_use_set_data = compute_should_use_set_data(param, param_applied)
227 if should_use_set_data:
/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py in convert(t)
439 if convert_to_format is not None and t.dim() == 4:
440 return t.to(device, dtype if t.is_floating_point() else None, non_blocking, memory_format=convert_to_format)
--> 441 return t.to(device, dtype if t.is_floating_point() else None, non_blocking)
442
443 return self._apply(convert)
RuntimeError: CUDA error: device-side assert triggered
A: No answers yet