[Keras Hands-On Tutorial] Text Classification with the Transformer, a Top-Performing Model for NLP Classification

Transformer Theory

In the paper "Attention Is All You Need", Google Brain proposed the Transformer, an encoder-decoder model built entirely on attention. It drops the recurrent and convolutional structures that earlier models had kept even after adding attention, and achieves large improvements in task performance, parallelism, and ease of training. The Transformer has since become an important baseline model for machine translation and many other text-understanding tasks.
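At the heart of the model is scaled dot-product attention: the queries Q are matched against the keys K, the scores are scaled by the square root of the key dimension and passed through a softmax, and the result weights the values V. Below is a minimal single-head NumPy sketch of this idea (names and shapes are illustrative only, not the paper's code):

import numpy as np

def scaled_dot_product_attention(q, k, v):
    # q, k, v: (seq_len, d_k); scores are scaled by sqrt(d_k) before the softmax
    d_k = q.shape[-1]
    scores = q @ k.T / np.sqrt(d_k)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights @ v

x = np.random.rand(4, 16)                           # self-attention: q, k, v all come from the same sequence
print(scaled_dot_product_attention(x, x, x).shape)  # (4, 16)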

Detailed model introduction

Paper analysis

GitHub: https://github.com/xiaosongshine/transfromer_keras

Transformer model implementation (based on Keras)

Position_Embedding

#! -*- coding: utf-8 -*-
#%%
from __future__ import print_function
from keras import backend as K
from keras.engine.topology import Layer  # on newer Keras versions: from keras.layers import Layer


class Position_Embedding(Layer):

    def __init__(self, size=None, mode='sum', **kwargs):
        self.size = size  # must be an even number
        self.mode = mode
        super(Position_Embedding, self).__init__(**kwargs)

    def call(self, x):
        if (self.size is None) or (self.mode == 'sum'):
            self.size = int(x.shape[-1])
        batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
        position_j = 1. / K.pow(10000.,
                                2 * K.arange(self.size / 2, dtype='float32') / self.size)
        position_j = K.expand_dims(position_j, 0)
        # K.arange does not support variable length, so the positions are generated this way
        position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
        position_i = K.expand_dims(position_i, 2)
        position_ij = K.dot(position_i, position_j)
        position_ij = K.concatenate([K.cos(position_ij), K.sin(position_ij)], 2)
        if self.mode == 'sum':
            return position_ij + x
        elif self.mode == 'concat':
            return K.concatenate([position_ij, x], 2)

    def compute_output_shape(self, input_shape):
        if self.mode == 'sum':
            return input_shape
        elif self.mode == 'concat':
            return (input_shape[0], input_shape[1], input_shape[2] + self.size)
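For a single sequence, the layer above computes the fixed sinusoidal positional encoding from the paper: each position is multiplied by one frequency per pair of dimensions, and the cosines and sines are concatenated (in that order, matching the code above). A small NumPy sketch with illustrative values of seq_len and size:

import numpy as np

seq_len, size = 10, 128
pos = np.arange(seq_len, dtype='float32')[:, None]              # positions 0 .. seq_len-1
freq = 1. / np.power(10000., 2 * np.arange(size // 2) / size)   # one frequency per pair of dimensions
angles = pos * freq                                             # (seq_len, size // 2)
pe = np.concatenate([np.cos(angles), np.sin(angles)], axis=-1)  # (seq_len, size); added to the embeddings when mode='sum'
print(pe.shape)  # (10, 128)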

Attention

class Attention(Layer):

    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.output_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.WQ = self.add_weight(name='WQ',
                                  shape=(input_shape[0][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WK = self.add_weight(name='WK',
                                  shape=(input_shape[1][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        self.WV = self.add_weight(name='WV',
                                  shape=(input_shape[2][-1], self.output_dim),
                                  initializer='glorot_uniform',
                                  trainable=True)
        super(Attention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode='mul'):
        if seq_len is None:
            return inputs
        else:
            mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, 1)
            for _ in range(len(inputs.shape) - 2):
                mask = K.expand_dims(mask, 2)
            if mode == 'mul':
                return inputs * mask
            if mode == 'add':
                return inputs - (1 - mask) * 1e12

    def call(self, x):
        # If only Q_seq, K_seq, V_seq are passed in, no mask is applied.
        # If Q_seq, K_seq, V_seq, Q_len, V_len are passed in, the padded positions are masked.
        if len(x) == 3:
            Q_seq, K_seq, V_seq = x
            Q_len, V_len = None, None
        elif len(x) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = x
        # Linear projections of Q, K and V
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
        Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
        K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
        V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
        # Scaled dot product, then mask, then softmax
        A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = self.Mask(A, V_len, 'add')
        A = K.permute_dimensions(A, (0, 3, 2, 1))
        A = K.softmax(A)
        # Weighted sum of the values, then mask the output
        O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
        O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
        O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, 'mul')
        return O_seq

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.output_dim)

Save the two code snippets above as Attention_keras.py.
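As a quick sanity check, the two layers can be wired into a tiny model and the output shape inspected. This is only a sketch, assuming Attention_keras.py is on the Python path:

import numpy as np
from keras.layers import Input, Embedding
from keras.models import Model
from Attention_keras import Position_Embedding, Attention

inp = Input(shape=(None,), dtype='int32')
emb = Position_Embedding()(Embedding(1000, 128)(inp))  # sinusoidal positions added to the word embeddings
out = Attention(8, 16)([emb, emb, emb])                # 8 heads x 16 dims per head = 128 output channels
m = Model(inp, out)
print(m.predict(np.random.randint(0, 1000, size=(2, 10))).shape)  # expected: (2, 10, 128)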

Training the model

Import packages and load the text data

#%%
from keras.preprocessing import sequence
from keras.datasets import imdb
from matplotlib import pyplot as plt
import pandas as pd

max_features = 20000

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# Convert the labels to one-hot encoding
y_train, y_test = pd.get_dummies(y_train), pd.get_dummies(y_test)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Output (the dataset is downloaded on the first run; since I have already downloaded it, it loads straight away):

Using TensorFlow backend.

Loading data...

25000 train sequences

25000 test sequences
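pd.get_dummies turns the integer labels 0/1 into two one-hot columns, which matches the 2-unit softmax output used later. A quick illustration:

import pandas as pd
print(pd.get_dummies([0, 1, 1, 0]))
# two columns named 0 and 1, one indicator column per class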

Data preprocessing: pad the sequences to a fixed length

#%% Pad the sequences to a fixed length
maxlen = 64

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Output (pad_sequences truncates sequences longer than maxlen and pads shorter ones up to maxlen):

Pad sequences (samples x time)

x_train shape: (25000, 64)

x_test shape: (25000, 64)
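A tiny illustration of that behaviour: by default pad_sequences pads and truncates at the front, so a short sequence is left-padded with zeros and a long one keeps only its last maxlen elements.

from keras.preprocessing import sequence
print(sequence.pad_sequences([[1, 2, 3], list(range(100))], maxlen=5))
# [[ 0  0  1  2  3]
#  [95 96 97 98 99]]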

Define the network model

batch_size = 5

from keras.models import Model
from keras.optimizers import SGD, Adam
from keras.layers import *
from Attention_keras import Position_Embedding, Attention  # the two custom layers defined above

S_inputs = Input(shape=(None,), dtype='int32')
embeddings = Embedding(max_features, 128)(S_inputs)
embeddings = Position_Embedding()(embeddings)  # adding Position_Embedding gives a small accuracy boost
O_seq = Attention(8, 16)([embeddings, embeddings, embeddings])
O_seq = GlobalAveragePooling1D()(O_seq)
O_seq = Dropout(0.5)(O_seq)
outputs = Dense(2, activation='softmax')(O_seq)

model = Model(inputs=S_inputs, outputs=outputs)
# try using different optimizers and different optimizer configs
opt = Adam(lr=0.0005)
loss = 'categorical_crossentropy'
model.compile(loss=loss,
              optimizer=opt,
              metrics=['accuracy'])
print(model.summary())

Model summary (the model is simple, with relatively few parameters):

==================================================================================================

input_1 (InputLayer) (None, None) 0

__________________________________________________________________________________________________

embedding_1 (Embedding) (None, None, 128) 2560000 input_1[0][0]

__________________________________________________________________________________________________

position__embedding_1 (Position (None, None, 128) 0 embedding_1[0][0]

__________________________________________________________________________________________________

attention_1 (Attention) (None, None, 128) 49152 position__embedding_1[0][0]

position__embedding_1[0][0]

position__embedding_1[0][0]

__________________________________________________________________________________________________

global_average_pooling1d_1 (Glo (None, 128) 0 attention_1[0][0]

__________________________________________________________________________________________________

dropout_1 (Dropout) (None, 128) 0 global_average_pooling1d_1[0][0]

__________________________________________________________________________________________________

dense_1 (Dense) (None, 2) 258 dropout_1[0][0]

==================================================================================================

Total params: 2,609,410

Trainable params: 2,609,410

Non-trainable params: 0

__________________________________________________________________________________________________
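The parameter counts in the summary can be verified by hand: the embedding holds max_features x 128 weights, the attention layer holds the three projection matrices WQ, WK and WV, and the softmax layer adds its weights and biases.

embedding_params = 20000 * 128      # max_features x embedding_dim        = 2,560,000
attention_params = 3 * 128 * 128    # WQ, WK, WV: each 128 x (8 heads*16) =    49,152
dense_params = 128 * 2 + 2          # Dense weights + biases              =       258
print(embedding_params + attention_params + dense_params)  # 2,609,410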

Train and save the model

#%%
print('Train...')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=2,
          validation_data=(x_test, y_test))
model.save("imdb.h5")

Output (after just 2 epochs the model already reaches over 80% accuracy, which is a solid result):

Train...

Train on 25000 samples, validate on 25000 samples

Epoch 1/2

25000/25000 [==============================] - 95s 4ms/step - loss: 0.4826 - acc: 0.7499 - val_loss: 0.3663 - val_acc: 0.8353

Epoch 2/2

25000/25000 [==============================] - 93s 4ms/step - loss: 0.3084 - acc: 0.8680 - val_loss: 0.3983 - val_acc: 0.8163

Save the code above as train.py.
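To reuse the saved weights later (a sketch that is not part of the original article): because the custom layers do not implement get_config, the simplest route is to rebuild the same architecture with train.py and then load the weights from imdb.h5, rather than calling load_model directly.

# assumes the model-definition code from train.py has already been run
model.load_weights("imdb.h5")
score = model.evaluate(x_test, y_test, batch_size=batch_size)
print(score)  # [test loss, test accuracy]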
