# XLNet modeling functions.
# Look up the embedding vectors for the given token ids.
def embedding_lookup(x, n_token, d_embed, initializer, use_tpu=True,
                     scope='embedding', reuse=None, dtype=tf.float32):
    """Embed token ids and return the embeddings together with the table.

    A variable named 'lookup_table' of shape [n_token, d_embed] is created
    (or reused, depending on `reuse`) inside variable scope `scope`.

    Args:
      x: tensor of token ids. On the one-hot path its one-hot encoding must
        be rank 2 or rank 3 (i.e. `x` itself is rank 1 or rank 2).
      n_token: number of rows in the lookup table (one-hot depth).
      d_embed: embedding width.
      initializer: initializer for the lookup table variable.
      use_tpu: if True, perform the lookup as a one-hot matrix product via
        einsum; otherwise use `tf.nn.embedding_lookup`.
      scope: variable scope name.
      reuse: forwarded to `tf.variable_scope`.
      dtype: dtype of the lookup table and of the one-hot encoding.

    Returns:
      A tuple `(embeddings, lookup_table)`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable('lookup_table', [n_token, d_embed],
                                dtype=dtype, initializer=initializer)

        # Gather-based path.
        if not use_tpu:
            return tf.nn.embedding_lookup(table, x), table

        # One-hot path: multiply the one-hot encoding by the table. Any
        # rank other than 2 is treated as rank 3.
        one_hot = tf.one_hot(x, n_token, dtype=dtype)
        if one_hot.shape.ndims == 2:
            equation = 'in,nd->id'
        else:
            equation = 'ibn,nd->ibd'
        return tf.einsum(equation, one_hot, table), table
# Build the positional embedding vectors.
def positional_embedding(pos_seq, inv_freq, bsz=None):
    """Build sinusoidal positional embeddings.

    Args:
      pos_seq: rank-1 tensor of positions (length I).
      inv_freq: rank-1 tensor of inverse frequencies (length D).
      bsz: optional batch size. When given, the embedding is tiled along
        the second axis that many times.

    Returns:
      A tensor of shape [I, 1, 2*D] when `bsz` is None, otherwise
      [I, bsz, 2*D]. The last axis holds the sines followed by the cosines.
    """
    # Outer product: angles[i, d] = pos_seq[i] * inv_freq[d].
    angles = tf.einsum('i,d->id', pos_seq, inv_freq)
    sin_part = tf.sin(angles)
    cos_part = tf.cos(angles)

    # Concatenate along the feature axis, then insert a singleton axis in
    # the middle.
    emb = tf.concat([sin_part, cos_part], -1)
    emb = emb[:, None, :]

    if bsz is None:
        return emb
    return tf.tile(emb, [1, bsz, 1])
# Dense layer -> dropout -> dense layer -> dropout -> residual connection (followed by layer normalization).
def positionwise_ffn(inp, d_model, d_inner, dropout, kernel_initializer,
                     activation_type='relu', scope='ff', is_training=True,
                     reuse=None):
    """Apply a position-wise feed-forward block with residual + layer norm.

    The computation is: dense(d_inner, activation) -> dropout ->
    dense(d_model) -> dropout -> layer_norm(result + inp).

    Args:
      inp: input tensor. It is added to the second dense layer's output,
        so its last dimension must be compatible with `d_model`.
      d_model: output width of the second dense layer.
      d_inner: output width of the first (hidden) dense layer.
      dropout: dropout rate used after each dense layer.
      kernel_initializer: initializer for both dense kernels.
      activation_type: 'relu' or 'gelu'; activation of the first layer.
      scope: variable scope name.
      is_training: passed as `training` to the dropout layers.
      reuse: forwarded to `tf.variable_scope`.

    Returns:
      The layer-normalized output tensor.

    Raises:
      ValueError: if `activation_type` is neither 'relu' nor 'gelu'.
    """
    # Reject unknown activations before building any ops.
    if activation_type not in ('relu', 'gelu'):
        raise ValueError('Unsupported activation type {}'.format(activation_type))
    # `gelu` is only referenced when it is actually requested.
    act_fn = tf.nn.relu if activation_type == 'relu' else gelu

    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(
            inp, d_inner,
            activation=act_fn,
            kernel_initializer=kernel_initializer,
            name='layer_1')
        hidden = tf.layers.dropout(
            hidden, dropout, training=is_training, name='drop_1')

        projected = tf.layers.dense(
            hidden, d_model,
            kernel_initializer=kernel_initializer,
            name='layer_2')
        projected = tf.layers.dropout(
            projected, dropout, training=is_training, name='drop_2')

        # Residual connection, then normalization over the last axis.
        normed = tf.contrib.layers.layer_norm(
            projected + inp, begin_norm_axis=-1, scope='LayerNorm')
    return normed
def head_projection(h, d_model, n_head, d_head, kernel_initializer, name):
    """Project hidden states into per-head vectors, giving a 4-D tensor.

    Args:
      h: rank-3 tensor whose last dimension is contracted against
        `d_model` (presumably laid out as [seq_len, batch, d_model];
        confirm against callers).
      d_model: size of the contracted (input) dimension.
      n_head: number of heads.
      d_head: width of each head.
      kernel_initializer: initializer for the projection kernel.
      name: prefix for the variable, which is created as '<name>/kernel'.

    Returns:
      A tensor of shape [h.shape[0], h.shape[1], n_head, d_head].
    """
    kernel_name = '{}/kernel'.format(name)
    kernel_shape = [d_model, n_head, d_head]
    kernel = tf.get_variable(kernel_name, kernel_shape, dtype=h.dtype,
                             initializer=kernel_initializer)
    # Contract the model dimension `h` and fan out into (head, head_dim).
    return tf.einsum('ibh,hnd->ibnd', h, kernel)