Modern Recurrent Neural Networks in Practice: Machine Translation

Machine Translation

Machine translation is the process of using computer technology to automatically translate text from one language into another. It aims to overcome language barriers and make communication across languages easier. Machine translation algorithms include statistical machine translation and neural machine translation, among others. The technology has made great progress, but many challenges remain, such as differences between languages, ambiguity, and polysemy.
Dataset

Reading the Dataset

The dataset used in this article consists of bilingual (English-French) sentence pairs from the Tatoeba project:
```python
import os
from d2l import torch as d2l

# Register the dataset location (as in the d2l book)
#@save
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')

def read_data_nmt():
    """Load the English-French dataset."""
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r',
              encoding='utf-8') as f:
        return f.read()

raw_text = read_data_nmt()
print(raw_text[:75])
```
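Before preprocessing, a quick sanity check confirms the format of the raw file: one pair per line, with the English sentence and its French translation separated by a tab (a minimal sketch; the variable names below are my own, not from the original post).

```python
# Sanity check on the raw text (illustrative sketch):
# each line holds "<english>\t<french>".
lines = raw_text.split('\n')
print(f'{len(lines)} lines loaded')
print(lines[0].split('\t'))  # an English sentence and its French translation
```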
Data Preprocessing

Normalize special spaces and separate punctuation from words so the text is easier to split later:
```python
#@save
def preprocess_nmt(text):
    """Preprocess the English-French dataset."""
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '

    # Replace non-breaking spaces with ordinary spaces,
    # and replace uppercase letters with lowercase ones
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    # Insert a space between words and punctuation marks
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)

text = preprocess_nmt(raw_text)
print(text[:80])
```
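A hand-made string shows what the preprocessing does: special spaces are normalized, everything is lowercased, and a space is inserted before any punctuation mark that follows a non-space character (the sample string below is my own illustration, not from the dataset).

```python
# Hypothetical sample input, not from the dataset
sample = 'Hello, world!\u202fHow are you?'
print(preprocess_nmt(sample))
# hello , world ! how are you ?
```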
Tokenization
```python
#@save
def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-French dataset."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

source, target = tokenize_nmt(text)
source[:6], target[:6]
```
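On a tiny synthetic input the behavior is easy to see: each line splits on the tab into a source and a target sentence, and each sentence splits on spaces into word-level tokens (the two-line string below is an illustration, not the real dataset).

```python
# Synthetic two-pair input (illustration only)
demo = 'go .\tva !\nhi .\tsalut !'
src, tgt = tokenize_nmt(demo)
print(src)  # [['go', '.'], ['hi', '.']]
print(tgt)  # [['va', '!'], ['salut', '!']]
```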
Plotting the Length Histogram
```python
#@save
def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):
    """Plot a histogram of list-length pairs."""
    d2l.set_figsize()
    _, _, patches = d2l.plt.hist(
        [[len(l) for l in xlist], [len(l) for l in ylist]])
    d2l.plt.xlabel(xlabel)
    d2l.plt.ylabel(ylabel)
    for patch in patches[1].patches:
        patch.set_hatch('/')
    d2l.plt.legend(legend)

show_list_len_pair_hist(['source', 'target'], '# tokens per sequence',
                        'count', source, target);
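If you prefer numbers over a plot, simple summary statistics make the same point about sequence lengths (a minimal sketch, assuming the `source` and `target` lists from the tokenization step above).

```python
# Length statistics as a plot-free alternative (sketch, not from the post)
src_lens = [len(l) for l in source]
tgt_lens = [len(l) for l in target]
print(f'source: max={max(src_lens)}, mean={sum(src_lens) / len(src_lens):.2f}')
print(f'target: max={max(tgt_lens)}, mean={sum(tgt_lens) / len(tgt_lens):.2f}')
```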
Defining the Vocabulary
```python
import collections


class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Defined in :numref:`sec_text_preprocessing`"""
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self._token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                   reverse=True)
        # The index for the unknown token is 0
        self.idx_to_token = ['<unk>'] + reserved_tokens
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}
        for token, freq in self._token_freqs:
            if freq < min_freq:
                break
            if token not in self.token_to_idx:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

    @property
    def unk(self):
        # Index for the unknown token
        return 0

    @property
    def token_freqs(self):
        # Token frequencies, sorted in descending order
        return self._token_freqs


def count_corpus(tokens):
    """Count token frequencies.

    Defined in :numref:`sec_text_preprocessing`"""
    # Here `tokens` is a 1D list or 2D list
    if len(tokens) == 0 or isinstance(tokens[0], list):
        # Flatten a list of token lists into a list of tokens
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)
```
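With `Vocab` defined, the source-side vocabulary can be built the way the d2l book does it: tokens seen fewer than twice fall back to `<unk>`, while `<pad>`, `<bos>`, and `<eos>` are reserved for padding and for marking sequence boundaries.

```python
# Build the source vocabulary (matches the d2l book's usage)
src_vocab = Vocab(source, min_freq=2,
                  reserved_tokens=['<pad>', '<bos>', '<eos>'])
print(len(src_vocab))
```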