Deep Learning / Spring 1399, Iran University of Science and Technology
Please pay attention to this note: wherever you see a block like the one below, you should put your implementation there.
########################################
# Put your implementation here #
########################################
Course Forum: https://groups.google.com/forum/#!forum/dl982/
Fill your information here & run the cell
#@title Enter your information & "RUN the cell!!" { run: "auto" }
student_id = 0#@param {type:"integer"}
student_name = "" #@param {type:"string"}
print("your student id:", student_id)
print("your name:", student_name)
from pathlib import Path
ASSIGNMENT_PATH = Path('asg04')
ASSIGNMENT_PATH.mkdir(parents=True, exist_ok=True)
In this assignment, you will:
- implement BERT's building blocks (the transformer encoder, the embedding layer, and the pooler) in TensorFlow/Keras,
- train a small BERT-based classifier on the Rotten Tomatoes critic reviews dataset,
- visualize its attention weights with bertviz,
- and fine-tune a pre-trained BERT-Base using Huggingface's Transformers library.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from tensorflow.keras.initializers import TruncatedNormal
import pandas as pd
import tensorflow_datasets as tfds
In order to implement BERT, we should first implement the encoder layer of the transformer. An encoder has two main sub-layers: a multi-headed attention layer and a simple feed-forward layer. The multi-headed attention layer is already implemented (a slightly modified version of the one in the Keras tutorial), but you should implement the feed-forward sub-layer and the residual connections (the Add & Norm blocks of the transformer encoder) yourself.
class MultiHeadAttention(layers.Layer):
def __init__(self, hidden_size, num_heads):
super(MultiHeadAttention, self).__init__()
self.hidden_size = hidden_size
self.num_heads = num_heads
self.projection_dim = hidden_size // num_heads
self.Q = layers.Dense(hidden_size)
self.K = layers.Dense(hidden_size)
self.V = layers.Dense(hidden_size)
self.out = layers.Dense(hidden_size)
def attention(self, query, key, value, mask):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
maxlen = tf.cast(tf.shape(scaled_score)[-1], tf.int64)
m = tf.repeat(mask, maxlen, axis=2) * (-1e9)
scaled_score += m
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs, att_mask):
batch_size = tf.shape(inputs)[0]
query = self.separate_heads(self.Q(inputs) , batch_size)
key = self.separate_heads(self.K(inputs), batch_size)
value = self.separate_heads(self.V(inputs) , batch_size)
attention, self.att_weights = self.attention(query, key, value, att_mask)
attention = tf.transpose(attention, perm=[0, 2, 1, 3])
concat_attention = tf.reshape(attention, (batch_size, -1, self.hidden_size))
output = self.out(concat_attention)
return output
Question: Why does the transformer use multi-headed attention instead of just a single self-attention?
Write your answer here
The feed-forward sub-layer of the encoder has two dense layers. The first dense layer is called the "intermediate" layer and the second one is the "output" layer, which projects back down to the hidden size. Dropout is also applied to the output of the intermediate layer. Unlike the original transformer, which uses ReLU, BERT uses the GELU activation function in the intermediate dense layer. Since there is no GELU activation function in TensorFlow (there is one in TensorFlow Addons, but it will crash your session!), you should implement it yourself!
Here is the GELU paper: https://arxiv.org/abs/1606.08415 . Or you can just search the internet!
@tf.function
def GELU(x):
########################################
# Put your implementation here #
########################################
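For reference, here is one possible implementation (a sketch under a hypothetical name, using the tanh approximation given in the GELU paper):
@tf.function
def GELU_sketch(x):
    # tanh approximation of GELU from Hendrycks & Gimpel (2016):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
    cdf = 0.5 * (1.0 + tf.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))
    return x * cdf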
class FFN(layers.Layer):
def __init__(self, intermediate_size, hidden_size, drop_rate):
super(FFN, self).__init__()
self.intermediate = layers.Dense(intermediate_size, activation=GELU, kernel_initializer=TruncatedNormal(stddev=0.02))
self.out = layers.Dense(hidden_size, kernel_initializer=TruncatedNormal(stddev=0.02))
self.drop = layers.Dropout(drop_rate)
def call(self, inputs):
########################################
# Put your implementation here #
########################################
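A possible sketch of the missing call body (assuming the layers defined in __init__ above; dropout is applied to the intermediate output before the down-projection):
def ffn_call_sketch(self, inputs):
    x = self.intermediate(inputs)   # up-project with GELU activation
    x = self.drop(x)                # dropout on the intermediate output
    return self.out(x)              # down-project back to hidden_size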
In the encoder, dropout is applied to each sub-layer's output, then it gets added to the sub-layer's input (residual connection) and finally goes through a layer normalization step. You should implement all the aforementioned steps in the AddNorm custom layer in the cell below!
class AddNorm(layers.Layer):
def __init__(self, LNepsilon, drop_rate):
super(AddNorm, self).__init__()
self.LN = layers.LayerNormalization(epsilon=LNepsilon)
self.dropout = layers.Dropout(drop_rate)
def call(self, sub_layer_in, sub_layer_out):
########################################
# Put your implementation here #
########################################
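A possible sketch of the missing call body (dropout on the sub-layer output, residual addition, then layer normalization):
def addnorm_call_sketch(self, sub_layer_in, sub_layer_out):
    out = self.dropout(sub_layer_out)     # dropout on the sub-layer's output
    return self.LN(sub_layer_in + out)    # residual connection + layer norm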
Now we have everything we need to implement an encoder layer!
class Encoder(layers.Layer):
def __init__(self, hidden_size, num_heads, intermediate_size, drop_rate=0.1, LNepsilon=1e-12):
super(Encoder, self).__init__()
self.attention = MultiHeadAttention(hidden_size, num_heads)
self.ffn = FFN(intermediate_size, hidden_size, drop_rate)
self.addnorm1 = AddNorm(LNepsilon, drop_rate)
self.addnorm2 = AddNorm(LNepsilon, drop_rate)
def call(self, inputs, mask):
########################################
# Put your implementation here #
########################################
def compute_mask(self, x, mask):
return mask
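A possible sketch of the missing call body, wiring the sub-layers together in the order described above:
def encoder_call_sketch(self, inputs, mask):
    att_out = self.attention(inputs, mask)   # multi-headed self-attention
    x = self.addnorm1(inputs, att_out)       # dropout + residual + layer norm
    ffn_out = self.ffn(x)                    # position-wise feed-forward
    return self.addnorm2(x, ffn_out)         # dropout + residual + layer norm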
In the previous part, you implemented the encoder layer. We only need two more layers to implement BERT. The first is the embedding layer. The final embedding for each token in BERT is the sum of three types of embeddings: aside from the token embeddings, there are also segment embeddings and position embeddings. For this assignment we ignore the segment embeddings, since we only want to do single-sentence classification!
Unlike the transformer, which uses fixed positional embeddings, BERT uses learned positional embeddings.
Note that layer normalization followed by dropout is applied to the final embeddings (after adding all the embeddings).
Question: What is segment embedding's functionality in BERT?
Write your answer here
class BertEmbedding(layers.Layer):
def __init__(self, vocab_size, maxlen, hidden_size):
super(BertEmbedding, self).__init__()
self.TokEmb = layers.Embedding(input_dim=vocab_size, output_dim=hidden_size, mask_zero=True)
self.PosEmb = tf.Variable(tf.random.truncated_normal(shape=(maxlen, hidden_size), stddev=0.02))
self.LN = layers.LayerNormalization(epsilon=1e-12)
self.dropout = layers.Dropout(0.1)
def call(self, inputs):
########################################
# Put your implementation here #
########################################
def compute_mask(self, x, mask=None):
m = 1-tf.cast(self.TokEmb.compute_mask(x), tf.float32)
m = m[:, tf.newaxis, tf.newaxis, :]
return m
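A possible sketch of the missing call body, assuming the inputs are already padded to maxlen so that the learned position embeddings broadcast over the batch:
def bert_embedding_call_sketch(self, inputs):
    x = self.TokEmb(inputs) + self.PosEmb    # token embeddings + learned position embeddings
    return self.dropout(self.LN(x))          # layer normalization followed by dropout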
The last layer you need to implement is the "pooler". The pooler converts the hidden states of the last encoder layer (of shape [batch_size, sequence_length, hidden_size]) into a vector representation (of shape [batch_size, hidden_size]) for each input sentence. It does this by simply taking the hidden state corresponding to the first token (a special token at the beginning of each sentence) and feeding it to a dense layer (tanh is used as the activation function of this dense layer in the original implementation).
class Pooler(layers.Layer):
def __init__(self, hidden_size):
super(Pooler, self).__init__()
self.dense = layers.Dense(hidden_size, activation='tanh')
def call(self, encoder_out):
########################################
# Put your implementation here #
########################################
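A possible sketch of the missing call body: take the hidden state of the first ([cls]) token and feed it to the tanh dense layer:
def pooler_call_sketch(self, encoder_out):
    return self.dense(encoder_out[:, 0, :])  # first token's hidden state -> dense + tanh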
Question: As it was explained earlier, the pooler's job is to create a single vector representation of a sentence (or sentence pair) by taking the hidden state corresponding to the first token. Can you suggest another form of pooling that could work for BERT?
Write your answer here
Now you should complete the create_BERT function in the cell below. This function gets BERT's hyper-parameters as its inputs and returns a BERT model. Use the functional API to create the model.
Note that the returned model must have two outputs (just like the pre-trained BERTs): the sequence of hidden states produced by the last encoder layer, and the pooler's output.
def create_BERT(vocab_size, maxlen, hidden_size, num_layers, num_att_heads, intermediate_size, drop_rate=0.1):
"""
creates a BERT model based on the arguments provided
Arguments:
vocab_size: number of words in the vocabulary
maxlen: maximum length of each sentence
hidden_size: dimension of the hidden state of each encoder layer
num_layers: number of encoder layers
num_att_heads: number of attention heads in the multi-headed attention layer
intermediate_size: dimension of the intermediate layer in the feed-forward sublayer of the encoders
drop_rate: dropout rate of all the dropout layers used in the model
returns:
model: a BERT model
"""
########################################
# Put your implementation here #
########################################
model = tf.keras.Model(inputs=?, outputs=[?, ?])
return model
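One way the cell above could be filled in is sketched below (under a hypothetical name). It assumes Keras's automatic mask propagation: BertEmbedding.compute_mask produces the attention mask, and Keras feeds it to each Encoder's mask argument.
def create_BERT_sketch(vocab_size, maxlen, hidden_size, num_layers, num_att_heads, intermediate_size, drop_rate=0.1):
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int64)
    x = BertEmbedding(vocab_size, maxlen, hidden_size)(inputs)   # attention mask is computed here
    for _ in range(num_layers):
        x = Encoder(hidden_size, num_att_heads, intermediate_size, drop_rate)(x)
    pooled = Pooler(hidden_size)(x)
    return tf.keras.Model(inputs=inputs, outputs=[x, pooled])    # [sequence output, pooled output]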
The Rotten Tomatoes critic reviews dataset is used for this assignment. This dataset consists of about 350,000 short reviews.
!wget https://github.com/iust-deep-learning/982/raw/master/static_files/assignments/asg04_assets/reviews.zip
!unzip reviews.zip
train_reviews, test_reviews = pd.read_csv('train_reviews.csv').values[:, 1:], pd.read_csv('test_reviews.csv').values[:, 1:]
(train_texts, train_labels), (test_texts, test_labels) = (train_reviews[:,0],train_reviews[:,1]), (test_reviews[:,0],test_reviews[:,1])
train_texts = [s.lower() for s in train_texts]
test_texts = [s.lower() for s in test_texts]
We use the subword text tokenizer from TensorFlow Datasets to train a tokenizer on the training set. The special token '[cls]' is reserved in the tokenizer's vocabulary so that we can add it to the beginning of each sentence later.
aprx_vocab_size = 20000
cls_token = '[cls]'
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(corpus_generator=train_texts,
target_vocab_size=aprx_vocab_size,
reserved_tokens=[cls_token])
Now complete the encode_sentence function in the cell below. This function receives a sentence and an integer denoting the maximum length of the sentence as inputs and returns a list of token ids. Here are the steps to implement this function:
1. Encode the '[cls]' token and put its id at the beginning of the list.
2. Encode the sentence with the tokenizer and append the resulting ids.
3. Truncate or zero-pad the list so that its length is exactly maxlen (id 0 is reserved for padding).
def encode_sentence(s, maxlen):
########################################
# Put your implementation here #
#########################################
return tok_id_list
test_encoding = encode_sentence('This is a test sentence!', 10)
assert len(test_encoding) == 10 and test_encoding[:1] == tokenizer.encode(cls_token)
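One possible sketch (hypothetical name): prepend the [cls] id, encode the sentence, then truncate or pad with 0 (the id reserved for padding, so that mask_zero works in the embedding layer).
def encode_sentence_sketch(s, maxlen):
    tok_ids = tokenizer.encode(cls_token) + tokenizer.encode(s)
    tok_ids = tok_ids[:maxlen]                        # truncate long sentences
    return tok_ids + [0] * (maxlen - len(tok_ids))    # zero-pad short ones to maxlen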
MAXLEN = 32
x_train = np.array([encode_sentence(x, MAXLEN) for x in train_texts], dtype=np.int64)
x_test = np.array([encode_sentence(x, MAXLEN) for x in test_texts], dtype=np.int64)
y_train = train_labels.astype(np.int64)
y_test = test_labels.astype(np.int64)
Now use the functional API and the create_BERT function you implemented earlier to create a classifier for the movie reviews dataset. Note that the intermediate layer in the feed-forward sub-layer of the encoders is set to $4\times H$ in the original BERT implementation, where $H$ is the hidden layer size.
## We use the BERT-Base hyper-parameters
hidden_size = 768
num_heads = 12
num_layers = 12
vocab_size = tokenizer.vocab_size
########################################
# Put your implementation here #
########################################
model = keras.Model(inputs=?, outputs=?)
model.compile(tf.keras.optimizers.Adam(learning_rate=5e-5), "binary_crossentropy", metrics=["accuracy"])
model.summary()
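A possible way to fill in the cell above (hypothetical names): build BERT with an intermediate size of 4*H, then put a sigmoid classification head on the pooled output.
bert = create_BERT(vocab_size, MAXLEN, hidden_size, num_layers, num_heads, 4 * hidden_size)
inp = layers.Input(shape=(MAXLEN,), dtype=tf.int64)
seq_out, pooled_out = bert(inp)
prob = layers.Dense(1, activation='sigmoid')(pooled_out)   # binary sentiment probability
model_sketch = keras.Model(inputs=inp, outputs=prob)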
history = model.fit(
x_train,
y_train,
batch_size=128,
epochs=2,
validation_data=(x_test, y_test)
)
In this section, we'll use bertviz to visualize attention in the BERT model trained in the last section.
#@title Run this!
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
sys.path += ['bertviz_repo']
from bertviz import head_view
def call_html():
import IPython
display(IPython.core.display.HTML('''
<script src="/static/components/requirejs/require.js"></script>
<script>
requirejs.config({
paths: {
base: '/static/base',
"d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
},
});
</script>
'''))
In order to use bertviz, we need the attention weights from the encoders of the BERT model implemented in the last section. Now you should complete the get_att_weights function in the cell below. This function gets two inputs: a model (the trained BERT-based model from the last section) and a list of token ids (an encoded sentence). Here's what you should do: run the token ids through the model so that every multi-headed attention layer computes its weights, then collect those weights (one tensor of shape [1, num_heads, MAXLEN, MAXLEN] per encoder layer) into a list ordered from the first encoder layer to the last.
def get_att_weights(model, tok_id_list):
########################################
# Put your implementation here #
########################################
return att_weights
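A possible sketch (hypothetical name). It assumes the MultiHeadAttention layers store their weights in att_weights during the forward pass (as in the implementation above), that they are reachable by walking the model's submodules, and that submodules returns them in the order the encoder layers were created.
def get_att_weights_sketch(model, tok_id_list):
    model(tf.constant([tok_id_list], dtype=tf.int64))   # forward pass to populate att_weights
    return [m.att_weights for m in model.submodules if isinstance(m, MultiHeadAttention)]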
test_sent = "Hello BERT!"
tok_id_list = encode_sentence(test_sent, MAXLEN)
att_weights = get_att_weights(model, tok_id_list)
assert len(att_weights) == num_layers
assert att_weights[0].shape == [1, num_heads, MAXLEN, MAXLEN]
import torch
def get_att_tok(model, sent):
maxlen = model.layers[0].input_shape[0][-1]
encoded_toks = encode_sentence(sent, maxlen)
att_weights = get_att_weights(model, encoded_toks)
pad_start_idx = np.min(np.where(np.array(encoded_toks) == 0))
toks = encoded_toks[:pad_start_idx]
atts = []
for att in att_weights:
layer_att = torch.FloatTensor(att[:, :, :pad_start_idx, :pad_start_idx].numpy())
atts.append(layer_att)
toks = [tokenizer.decode([m]) for m in toks]
return toks, atts
#@title Attention Heads Visualization
sent = "I hated that movie"#@param {type:"string"}
toks, atts = get_att_tok(model, sent.lower())
call_html()
head_view(atts, toks)
Up until now, we've only used BERT with randomly initialized weights. But we can achieve far better results by using pre-trained models, since they've been pre-trained on huge corpora and already have a reasonable understanding of language.
In this section, we'll use Huggingface's Transformers to fine-tune BERT-Base on the movie critic's reviews dataset.
!pip install transformers
from transformers import BertTokenizer, TFBertModel
## Load the tokenizer of BERT-Base
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
## Load the BERT-Base model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
def encode_sentences(s_list, maxlen):
toks, masks = [], []
for x in s_list:
enc = bert_tokenizer.encode_plus(x, max_length=maxlen, pad_to_max_length=True)
toks.append(enc['input_ids'])
masks.append(enc['attention_mask'])
return np.array(toks).astype(np.int64), np.array(masks).astype(np.int64)
Question: The pre-trained BERT model requires attention masks as well as the input token ids. What are these attention masks used for? How did we create them in our own implementation in the first part of the assignment?
Write your answer here
MAXLEN = 32
train_toks, train_masks = encode_sentences(train_texts, MAXLEN)
test_toks, test_masks = encode_sentences(test_texts, MAXLEN)
y_train = train_labels.astype(np.int64)
y_test = test_labels.astype(np.int64)
Read the documentation of the Huggingface Transformers library and create a classifier using BERT-Base in the cell below.
toks = tf.keras.layers.Input(shape=(MAXLEN), dtype=tf.int64)
masks = tf.keras.layers.Input(shape=(MAXLEN), dtype=tf.int64)
########################################
# Put your implementation here #
#######################################
pre_trained_model = tf.keras.Model(inputs=[toks, masks], outputs=[?])
pre_trained_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
loss='binary_crossentropy',
metrics=['acc'])
pre_trained_model.summary()
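A possible sketch for the cell above (hypothetical names). Note that, depending on the transformers version, the pooled output is either the second element of the returned tuple or the pooler_output attribute of the returned object.
bert_outputs = bert_model(toks, attention_mask=masks)
pooled_output = bert_outputs[1]                             # [CLS] pooled representation
probs = tf.keras.layers.Dense(1, activation='sigmoid')(pooled_output)
pre_trained_model_sketch = tf.keras.Model(inputs=[toks, masks], outputs=probs)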
pre_trained_model.fit([train_toks, train_masks],
y_train,
batch_size=128,
epochs=1,
validation_data=([test_toks, test_masks], y_test))
Congratulations! You finished the assignment & you're ready to submit your work. Please follow these instructions: run the cells below to create the submission file (named dl_asg04__xx__xx.zip) and upload it via https://forms.gle/sE4kFfDNVaDc7S1v8
! pip install -U --quiet PyDrive > /dev/null
# ! wget -q https://github.com/github/hub/releases/download/v2.10.0/hub-linux-amd64-2.10.0.tgz
import os
import time
import yaml
import json
from google.colab import files
from IPython.display import Javascript
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
asg_name = 'assignment_4'
script_save = '''
require(["base/js/namespace"],function(Jupyter) {
Jupyter.notebook.save_checkpoint();
});
'''
# repo_name = 'iust-deep-learning-assignments'
submission_file_name = 'dl_asg04__%s__%s.zip'%(student_id, student_name.lower().replace(' ', '_'))
sub_info = {
'student_id': student_id,
'student_name': student_name,
'dateime': str(time.time()),
'asg_name': asg_name
}
json.dump(sub_info, open('info.json', 'w'))
Javascript(script_save)
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
file_id = drive.ListFile({'q':"title='%s.ipynb'"%asg_name}).GetList()[0]['id']
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('%s.ipynb'%asg_name)
! jupyter nbconvert --to script "$asg_name".ipynb > /dev/null
! jupyter nbconvert --to html "$asg_name".ipynb > /dev/null
! zip "$submission_file_name" "$asg_name".ipynb "$asg_name".html "$asg_name".txt info.json > /dev/null
print("##########################################")
print("Done! Submisson created, Please download using the bellow cell!")
files.download(submission_file_name)