In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from kaggle_datasets import KaggleDatasets
from sklearn.metrics import accuracy_score
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm

## Helper Functions

In [None]:
def fast_encode(dataset, tokenizer, chunk_size=256, maxlen=512, lang_map=None):
    """
    Tokenize comments chunk by chunk and map each row's language to an id.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        dataset: table with "comment_text" and "lang" columns; each column
            must support ``len``, positional ``.iloc`` slicing and
            ``.tolist()`` (e.g. a pandas DataFrame).
        tokenizer: object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``. Its truncation and padding settings are
            changed in place.
        chunk_size: number of rows handed to each ``encode_batch`` call.
        maxlen: length passed to both truncation and padding.
        lang_map: optional mapping from language code to integer id.
            Defaults to ``{"en": 0, "es": 1}``.

    Returns:
        Tuple ``(ids, langs)`` of NumPy arrays with one entry per row:
        the token ids of every encoding and the mapped language ids.

    Raises:
        KeyError: if a row's language code is not a key of ``lang_map``.
    """
    if lang_map is None:
        lang_map = {"en": 0, "es": 1}

    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    texts = dataset["comment_text"]
    langs = dataset["lang"]

    all_ids = []
    all_langs = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        stop = start + chunk_size
        # .iloc keeps the slice positional regardless of the frame's index
        # (the training frame is a concat and has repeated index labels).
        text_chunk = texts.iloc[start:stop].tolist()
        lang_chunk = langs.iloc[start:stop].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)
        all_langs.extend(lang_map[code] for code in lang_chunk)

    return np.array(all_ids), np.array(all_langs)

In [None]:
def fast_encode_en(dataset, tokenizer, chunk_size=256, maxlen=512):
    """
    Chunked encoder in which both supported language codes map to id 0.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    The tokenizer's truncation and padding are set to ``maxlen`` in place.
    Rows are read from the "comment_text" and "lang" columns of ``dataset``
    in slices of ``chunk_size``. Returns ``(ids, langs)`` as NumPy arrays;
    a language code other than "en" or "es" raises ``KeyError``.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    # "en" and "es" both resolve to 0.
    lang_to_id = dict.fromkeys(("en", "es"), 0)

    comments = dataset["comment_text"]
    languages = dataset["lang"]
    n_rows = len(comments.values)

    token_ids, lang_ids = [], []
    for begin in tqdm(range(0, n_rows, chunk_size)):
        window = slice(begin, begin + chunk_size)
        encodings = tokenizer.encode_batch(comments[window].tolist())
        token_ids += [encoding.ids for encoding in encodings]
        lang_ids += [lang_to_id[code] for code in languages[window].tolist()]

    return np.array(token_ids), np.array(lang_ids)

In [None]:

def build_model(transformer, max_len=512):
    """
    Build and compile a binary classifier on top of a transformer encoder.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        transformer: callable Keras-compatible encoder; element 0 of its
            output is indexed as ``[:, 0, :]`` to take the position-0 vector.
        max_len: length of the token-id input.

    Returns:
        Compiled Keras model with inputs ``[input_word_ids, input_lang]``
        and a single sigmoid output.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # Not consumed by this graph. It is declared so the model exposes a
    # language input: build_language_senstive_model reads it back through
    # tuned_model.inputs[1].
    input_lang = Input(shape=(1,), dtype=tf.int32, name="input_lang")

    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    hidden = Dense(100, activation='linear')(cls_token)
    out = Dense(1, activation='sigmoid')(hidden)
    model = Model(inputs=[input_word_ids, input_lang], outputs=out)

    # The model has exactly one output, so no per-output loss_weights list is
    # passed (the previous two-element list did not match the output count).
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:

def build_clf_puller_model(tuned_model, max_len=512):
    """
    Build a model that returns the encoder's position-0 vector.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        tuned_model: Keras model containing a layer named "tf_bert_model".
        max_len: unused; kept so the signature stays unchanged.

    Returns:
        Compiled Keras model sharing ``tuned_model``'s inputs whose output is
        ``[:, 0, :]`` of element 0 of the "tf_bert_model" layer output.
    """
    # NOTE(review): the layer is looked up by a fixed name; presumably Keras
    # assigns a suffixed name if the encoder is instantiated again in the same
    # session (e.g. after re-running the load cell) — confirm before reuse.
    encoder_outputs = tuned_model.get_layer("tf_bert_model").output
    cls_token = encoder_outputs[0][:, 0, :]
    model = Model(inputs=tuned_model.inputs, outputs=cls_token)

    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:

def build_language_senstive_model(tuned_model, scale_rotate, vec_to_center, vec_to_translate, max_len=512):
    """
    Build a classifier head on a frozen encoder output transformed per language.

    Every layer of ``tuned_model`` is marked non-trainable (in place). The
    position-0 vector of its "tf_bert_model" layer output is then, for each
    example and according to the language id read from
    ``tuned_model.inputs[1]``:

    1. centred by subtracting ``vec_to_center[lang]``,
    2. multiplied on the right by ``scale_rotate[lang]``,
    3. shifted by adding ``vec_to_translate`` (same vector for every example).

    Two new Dense layers (100 linear units, then 1 sigmoid unit) are applied
    to the result.

    Args:
        tuned_model: Keras model containing a layer named "tf_bert_model"
            and whose second input carries the language id.
        scale_rotate: array indexed by language id along axis 0, one matrix
            per language.
        vec_to_center: array indexed by language id along axis 0, one vector
            per language.
        vec_to_translate: single vector added to every transformed vector.
        max_len: unused; kept so the signature stays unchanged.

    Returns:
        Compiled Keras model sharing ``tuned_model``'s inputs, with one
        sigmoid output.
    """
    # Freeze everything inherited from the tuned model; the Dense layers
    # created below are new and remain trainable.
    for layer in tuned_model.layers:
        layer.trainable = False

    cls_token = tuned_model.get_layer("tf_bert_model").output[0][:, 0, :]
    # Take column 0 of the language input so it can serve as a gather index.
    lang_ids = tuned_model.inputs[1][:, 0]

    SR = tf.constant(scale_rotate)
    VTC = tf.constant(vec_to_center)
    VTT = tf.constant(vec_to_translate)

    centered = cls_token - tf.gather(VTC, lang_ids)

    # Batched row-vector @ matrix: insert a length-1 axis for matmul and
    # squeeze it out again afterwards.
    transformed = tf.linalg.matmul(tf.expand_dims(centered, 1), tf.gather(SR, lang_ids))

    # VTT is not gathered by language: the same vector is added to every row.
    translated = tf.squeeze(transformed, 1) + VTT

    hidden = Dense(100, activation='linear')(translated)
    out = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=tuned_model.inputs, outputs=out)

    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError raised inside the try block is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the cluster and initialise the TPU system before creating
    # the strategy that uses it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# Number of replicas; BATCH_SIZE in the configuration cell is scaled by it.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Passed to .prefetch() when the tf.data pipelines are built.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): GCS_DS_PATH is not referenced anywhere else in the visible
# notebook (the CSV files are read from /kaggle/input) — confirm it is needed.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
EPOCHS = 1
# 16 examples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Truncation / padding length used for every encode call and for the model input.
MAX_LEN = 192

## Create fast tokenizer

In [None]:
# First load the real tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# Save the loaded tokenizer locally
# (the next statement reads 'vocab.txt' from the current directory)
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
# lowercase=False matches the cased checkpoint loaded above.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
# Last expression of the cell: display the tokenizer's repr.
fast_tokenizer

## Load text data into memory

In [None]:
# Directory holding the competition CSV files.
JIGSAW_DIR = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"

############# train
train1 = pd.read_csv(os.path.join(JIGSAW_DIR, "jigsaw-toxic-comment-train.csv"))
train2 = pd.read_csv(os.path.join(JIGSAW_DIR, "jigsaw-unintended-bias-train.csv"))
# Binarise the label of the second file by rounding to the nearest integer.
train2.toxic = train2.toxic.round().astype(int)

# All of train1, every toxic row of train2, and a fixed-seed sample of
# 60000 non-toxic rows of train2.
train = pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=60000, random_state=0)
])

# The concatenation above ends with one all-toxic block followed by one
# all-non-toxic block. The training pipeline only shuffles through a
# 2048-element buffer, which cannot mix blocks of that size, so the rows are
# shuffled once here with a fixed seed.
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

# Every training row is tagged as English.
lang = ["en"] * train.shape[0]
train["lang"] = lang


############# test
orig_valid = pd.read_csv(os.path.join(JIGSAW_DIR, "validation.csv"))
# Only the Spanish rows of the validation file are used for evaluation.
test = orig_valid.loc[orig_valid['lang'] == "es"]


In [None]:
# Display the assembled training frame (comment_text, toxic and lang columns).
train

In [None]:
# Labels are read straight from the `toxic` column of each frame.
y_train = train["toxic"].values
y_test = test["toxic"].values

# Each encoder call returns an (ids, langs) pair.
x_train = fast_encode(train, fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(test, fast_tokenizer, maxlen=MAX_LEN)

# Same test rows, encoded with the variant that assigns language id 0 to all.
x_test_en = fast_encode_en(test, fast_tokenizer, maxlen=MAX_LEN)

## Build datasets objects

In [None]:
# Training stream: repeated indefinitely, shuffled through a 2048-element
# buffer, then batched and prefetched.
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_slices.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Evaluation stream with the language ids produced by fast_encode.
test_slices = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_slices.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Evaluation stream with the language ids produced by fast_encode_en.
test_en_slices = tf.data.Dataset.from_tensor_slices((x_test_en, y_test))
test_dataset_en = test_en_slices.batch(BATCH_SIZE).cache().prefetch(AUTO)


In [None]:
# Print only the first training batch as a sanity check.
for i in train_dataset.take(1):
    print (i)

## Load model into the TPU

In [None]:
%%time
# Create the encoder and the classifier inside the distribution strategy's scope.
with strategy.scope():
    transformer_layer = (
        transformers.TFBertModel
        .from_pretrained('bert-base-multilingual-cased')
    )
    # max_len must match the MAX_LEN used when encoding the text.
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

## Train Model

First, we train on the subset of the training set, which is completely in English.

In [None]:
# x_train is an (ids, langs) pair; the row count of the ids array sets the
# number of steps per epoch, since the training dataset repeats indefinitely.
num_train_rows = x_train[0].shape[0]
n_steps = num_train_rows // BATCH_SIZE
train_history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=n_steps)

In [None]:
%%time
with strategy.scope():
    # Wrap the fine-tuned classifier so that it outputs the encoder's
    # position-0 vector instead of the sigmoid score.
    clf_puller_model = build_clf_puller_model(model)
clf_puller_model.summary()

In [None]:
!wget -O 3000_words.txt --no-check-certificate "https://drive.google.com/uc?export=download&id=1F6DKWWdi5G95jtiQt79Kz9vHXV7HajFG"
!wget -O en_3000_glosses.txt --no-check-certificate "https://drive.google.com/uc?export=download&id=1-4Bhj5BUf56KlesmF4WySASp8kFA1o9v"
!wget -O es_3000_glosses.txt --no-check-certificate "https://drive.google.com/uc?export=download&id=1-IqUpetghkeOfCnFA_gGwwzD2zyl_EYd"

In [None]:
def _read_lines(path):
    """Return the lines of a UTF-8 text file without their line endings."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


en_lines = _read_lines("en_3000_glosses.txt")
es_lines = _read_lines("es_3000_glosses.txt")
words_lines = _read_lines("3000_words.txt")

# The three files are consumed in parallel, one line per word, so both gloss
# files must have at least as many lines as the word list.
if min(len(en_lines), len(es_lines)) < len(words_lines):
    raise ValueError(
        "gloss files are shorter than the word list: "
        f"{len(words_lines)} words, {len(en_lines)} en glosses, {len(es_lines)} es glosses"
    )

en_glosses = []
es_glosses = []
words = []
for word, en_gloss, es_gloss in zip(words_lines, en_lines, es_lines):
    # Keep a word only when neither of its glosses is the "-" placeholder.
    if en_gloss != "-" and es_gloss != "-":
        en_glosses.append(en_gloss)
        es_glosses.append(es_gloss)
        words.append(word)

# Same two-column layout (comment_text, lang) that fast_encode reads.
en_df = {
    "comment_text": en_glosses,
    "lang": ["en"] * len(en_glosses),
}
en_gloss_df = pd.DataFrame(en_df)

es_df = {
    "comment_text": es_glosses,
    "lang": ["es"] * len(es_glosses),
}
es_gloss_df = pd.DataFrame(es_df)

# Each result is an (ids, langs) pair.
en_gloss_data = fast_encode(en_gloss_df, fast_tokenizer, maxlen=MAX_LEN)
es_gloss_data = fast_encode(es_gloss_df, fast_tokenizer, maxlen=MAX_LEN)


Now that we have pretty much saturated the learning potential of the model on English-only data, we train it for one more epoch on the `validation` set, which is significantly smaller but contains a mixture of different languages.

## Submission

In [None]:
# Run both gloss sets through the position-0 extraction model, English first.
en_clf, es_clf = (
    clf_puller_model.predict(gloss_data, verbose=1)
    for gloss_data in (en_gloss_data, es_gloss_data)
)

In [None]:
# Centre each set of gloss embeddings on its own centroid.
en_centroid = np.average(en_clf, axis=0)
es_centroid = np.average(es_clf, axis=0)

en_centered = en_clf - en_centroid
es_centered = es_clf - es_centroid

# Orthogonal alignment from the SVD of the cross-covariance of the paired,
# centred embeddings. R is applied on the right of Spanish row vectors
# (es_centered @ R) to bring them towards their English counterparts.
# No determinant correction is applied to R.
H = np.dot(en_centered.T, es_centered)

U, sigma, VT = np.linalg.svd(H)
R = np.dot(VT.T, U.T)

# Pairwise residuals before and after applying R.
orig_diff = en_centered - es_centered
diff = np.dot(es_centered, R) - en_centered

# Change in residual norm per gloss pair; negative values mean the pair is
# closer after the alignment. Summarised rather than printed row by row.
residual_norm_change = np.linalg.norm(diff, axis=1) - np.linalg.norm(orig_diff, axis=1)
pd.Series(residual_norm_change, name="residual_norm_change").describe()

In [None]:

# Axis 0 is the language id: index 0 pairs the identity matrix with the
# English centroid, index 1 pairs R with the Spanish centroid.
embedding_dim = R.shape[0]
scale_rotate = np.stack([np.eye(embedding_dim), R]).astype(np.float32)
vec_to_center = np.stack([en_centroid, es_centroid]).astype(np.float32)
# Every transformed vector is shifted by the English centroid.
vec_to_translate = en_centroid.astype(np.float32)


In [None]:
%%time
with strategy.scope():
    
    language_senstive_model = build_language_senstive_model(model, scale_rotate, vec_to_center, vec_to_translate, max_len=512)
language_senstive_model.summary()

In [None]:
# Render the layer graph of the language-aware model.
tf.keras.utils.plot_model(language_senstive_model)

In [None]:
# Fit the language-aware model on the same training stream; this overwrites
# the train_history returned by the first fit.
num_train_rows = x_train[0].shape[0]
n_steps = num_train_rows // BATCH_SIZE
train_history = language_senstive_model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=n_steps)

In [None]:
# Score the Spanish validation rows with the language-aware model.
# NOTE(review): this feeds `test_dataset_en`, which is built from
# `fast_encode_en` and therefore carries language id 0 for every row, so the
# model selects index 0 of scale_rotate / vec_to_center here. `test_dataset`
# (language id 1 for "es") is built earlier but never used in the visible
# notebook — confirm which dataset is intended.
predictions = language_senstive_model.predict(test_dataset_en, verbose=1)

In [None]:
# Drop size-1 axes so the predictions can be compared with the 1-D y_test labels.
predictions = np.squeeze(predictions)

In [None]:
# Accuracy of the rounded sigmoid outputs against the test labels.
accuracy_score(y_test, np.round(predictions))