XLNet Code Walkthrough

xlnet

Posted by 黄梓林 on February 20, 2020

XLNet modeling functions


embedding_lookup: look up the embedding vector for each input token. On TPU the gather is replaced by a one-hot matmul (via einsum), which maps better to the hardware; on GPU/CPU it falls back to tf.nn.embedding_lookup.

import tensorflow as tf


def embedding_lookup(x, n_token, d_embed, initializer, use_tpu=True,
                     scope='embedding', reuse=None, dtype=tf.float32):
  """TPU and GPU embedding_lookup function."""
  with tf.variable_scope(scope, reuse=reuse):
    lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
                                   dtype=dtype, initializer=initializer)
    if use_tpu:
      # On TPU, a dense one-hot matmul is faster than a sparse gather.
      one_hot_idx = tf.one_hot(x, n_token, dtype=dtype)
      if one_hot_idx.shape.ndims == 2:
        # x: [seq_len] -> output: [seq_len, d_embed]
        return tf.einsum('in,nd->id', one_hot_idx, lookup_table), lookup_table
      else:
        # x: [seq_len, bsz] -> output: [seq_len, bsz, d_embed]
        return tf.einsum('ibn,nd->ibd', one_hot_idx, lookup_table), lookup_table
    else:
      # On GPU/CPU a plain gather is more efficient.
      return tf.nn.embedding_lookup(lookup_table, x), lookup_table
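
A minimal sketch of calling it in isolation, under the assumption of a toy vocabulary and the time-major [seq_len, bsz] input layout used elsewhere in the XLNet code (the sizes and initializer below are illustrative, not from the original post):

n_token, d_embed = 32000, 1024                 # hypothetical vocab size and embedding dim
initializer = tf.random_normal_initializer(stddev=0.02)
inp_ids = tf.placeholder(tf.int32, [128, 4])   # [seq_len, bsz]
word_emb, lookup_table = embedding_lookup(
    inp_ids, n_token=n_token, d_embed=d_embed, initializer=initializer,
    use_tpu=False)
# word_emb: [seq_len, bsz, d_embed]; the returned lookup_table can be reused
# later, e.g. for weight tying in the LM softmax.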


positional_embedding: build the sinusoidal positional embeddings. The outer product of the position sequence and the inverse frequencies is passed through sin and cos, the two halves are concatenated along the last axis, and the result is optionally tiled across the batch dimension.

def positional_embedding(pos_seq, inv_freq, bsz=None):
  # Outer product: [seq_len] x [d_model/2] -> [seq_len, d_model/2].
  sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq)
  # Concatenate sin and cos halves -> [seq_len, d_model].
  pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
  # Add a batch axis -> [seq_len, 1, d_model].
  pos_emb = pos_emb[:, None, :]

  if bsz is not None:
    # Tile across the batch -> [seq_len, bsz, d_model].
    pos_emb = tf.tile(pos_emb, [1, bsz, 1])

  return pos_emb
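
For context, in the full model the caller builds inv_freq from the usual 1/10000^(2i/d) schedule and a relative position sequence running from klen down past zero (this is what relative_positional_encoding does in the original repo). A minimal sketch, with illustrative lengths:

qlen, klen, bsz, d_model = 16, 32, 4, 64       # illustrative sizes, not from the post
freq_seq = tf.range(0, d_model, 2.0)
inv_freq = 1 / (10000 ** (freq_seq / d_model))          # [d_model // 2]
pos_seq = tf.range(klen, -qlen, -1.0)                   # relative positions klen, ..., -qlen + 1
pos_emb = positional_embedding(pos_seq, inv_freq, bsz)  # [klen + qlen, bsz, d_model]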


positionwise_ffn: dense layer -> dropout -> dense layer -> dropout, then a residual connection with the input followed by layer normalization.

def positionwise_ffn(inp, d_model, d_inner, dropout, kernel_initializer,
                     activation_type='relu', scope='ff', is_training=True,
                     reuse=None):
  """Position-wise Feed-forward Network."""
  if activation_type == 'relu':
    activation = tf.nn.relu
  elif activation_type == 'gelu':
    activation = gelu
  else:
    raise ValueError('Unsupported activation type {}'.format(activation_type))

  output = inp
  with tf.variable_scope(scope, reuse=reuse):
    # Expand to the inner dimension d_inner with a non-linearity.
    output = tf.layers.dense(output, d_inner, activation=activation,
                             kernel_initializer=kernel_initializer,
                             name='layer_1')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_1')
    # Project back to d_model.
    output = tf.layers.dense(output, d_model,
                             kernel_initializer=kernel_initializer,
                             name='layer_2')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_2')
    # Residual connection with the input, then layer normalization.
    output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1,
                                          scope='LayerNorm')
  return output
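
A minimal sketch of applying it to a block of hidden states (the shapes and hyper-parameters are illustrative; 'relu' is used here so the snippet does not depend on the gelu helper defined elsewhere in modeling.py):

h = tf.random.normal([128, 4, 1024])           # [seq_len, bsz, d_model], illustrative
initializer = tf.random_normal_initializer(stddev=0.02)
out = positionwise_ffn(h, d_model=1024, d_inner=4096, dropout=0.1,
                       kernel_initializer=initializer,
                       activation_type='relu', is_training=True)
# out has the same shape as h, so the block can be stacked layer after layer.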


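head_projection: project the hidden states into a 4-D per-head representation of shape [seq_len, bsz, n_head, d_head]; it is used to produce the query, key and value heads for the attention layers.
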
def head_projection(h, d_model, n_head, d_head, kernel_initializer, name):
  """Project hidden states to a specific head with a 4D-shape."""
  proj_weight = tf.get_variable('{}/kernel'.format(name),
                                [d_model, n_head, d_head], dtype=h.dtype,
                                initializer=kernel_initializer)
  # h: [seq_len, bsz, d_model] -> head: [seq_len, bsz, n_head, d_head].
  head = tf.einsum('ibh,hnd->ibnd', h, proj_weight)

  return head
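
A minimal sketch of producing query/key/value heads with it (the 'q'/'k'/'v' names follow the original repo; in the full model the key and value projections act on the memory-concatenated hidden states, and the sizes here are illustrative):

h = tf.random.normal([128, 4, 1024])           # [seq_len, bsz, d_model], illustrative
initializer = tf.random_normal_initializer(stddev=0.02)
q_head = head_projection(h, 1024, 16, 64, initializer, 'q')
k_head = head_projection(h, 1024, 16, 64, initializer, 'k')
v_head = head_projection(h, 1024, 16, 64, initializer, 'v')
# each: [seq_len, bsz, n_head=16, d_head=64]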