I'm trying to implement English to Hindi translation using Deep Learning LSTM. But when I train the model it shows 'nan' loss in both actual and validation.
Link of text file containing translation pairs-: http://www.manythings.org/anki/
Below is my Jupyter notebook code-:
import string
import re
from numpy import array, argmax, random, take, delete
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector, TimeDistributed
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
# function to read raw text file
def read_text(filename):
# open the file
file = open(filename, mode='rt', encoding='utf-8')
# read all text
text = file.read()
file.close()
return text
# split a text into sentences
def to_lines(text):
sents = text.strip().split('\n')
sents = [i.split('\t') for i in sents]
return sents
data = read_text("/content/drive/My Drive/Colab Notebooks/Language Translator New/hin.txt")
eng_hin = to_lines(data)
eng_hin = array(eng_hin)
eng_hin = delete(eng_hin, 2, axis=1)
# Remove punctuation
eng_hin[:,0] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,0]]
eng_hin[:,1] = [s.translate(str.maketrans('', '', string.punctuation)) for s in eng_hin[:,1]]
# convert to lowercase
for i in range(len(eng_hin)):
eng_hin[i,0] = eng_hin[i,0].lower()
eng_hin[i,1] = eng_hin[i,1].lower()
# empty lists
eng_l = []
hin_l = []
# populate the lists with sentence lengths
for i in eng_hin[:,0]:
eng_l.append(len(i.split()))
for i in eng_hin[:,1]:
hin_l.append(len(i.split()))
print(max(eng_l))
print(max(hin_l))
# function to build a tokenizer
def tokenization(lines):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
return tokenizer
# prepare english tokenizer
eng_tokenizer = tokenization(eng_hin[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = 25
print('English Vocabulary Size: %d' % eng_vocab_size)
# prepare Hindi tokenizer
hin_tokenizer = tokenization(eng_hin[:, 1])
hin_vocab_size = len(hin_tokenizer.word_index) + 1
hin_length = 25
print('Hindi Vocabulary Size: %d' % hin_vocab_size)
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
# integer encode sequences
seq = tokenizer.texts_to_sequences(lines)
# pad sequences with 0 values
seq = pad_sequences(seq, maxlen=length, padding='post')
return seq
# Model Building
from sklearn.model_selection import train_test_split
train, test = train_test_split(eng_hin, test_size=0.2, random_state = 12)
# prepare training data
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences(hin_tokenizer, hin_length, train[:, 1])
# prepare validation data
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(hin_tokenizer, hin_length, test[:, 1])
# build NMT model
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
model = Sequential()
model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
model.add(LSTM(units))
model.add(RepeatVector(out_timesteps))
model.add(LSTM(units, return_sequences=True))
model.add(Dense(out_vocab, activation='softmax'))
return model
model = build_model(hin_vocab_size, eng_vocab_size, hin_length, eng_length, 512)
rms = optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
filename = '/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/model.h1.31_dec_19'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
epochs=100, batch_size=64,
validation_split = 0.2,
callbacks=[checkpoint], verbose=1)
model.save('/content/drive/My Drive/Colab Notebooks/Language Translator New/Englis_Hindi_Checkpoints/eng2hin.h5')
When I try to fit the model, it runs but shows 'nan' in loss. Please help me to resolve my issue.