I am using Google Colab to run a simple experiment.
The idea is to visualize attention weights and predictions on text data.
Here is the code:
!pip install witwidget
!pip install transformers
!pip install sentence_transformers
from transformers import BertTokenizer, BertForSequenceClassification,BertForMaskedLM
from sentence_transformers import SentenceTransformer
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
import transformers
import sys
import pandas as pd
import numpy as np
tokenizer1=BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer2=BertTokenizer.from_pretrained('bert-base-cased')
model1=BertForSequenceClassification.from_pretrained('bert-base-uncased',return_dict=True)
model2=BertForSequenceClassification.from_pretrained('bert-base-cased',return_dict=True)
# list of strings
lst = [["learn the whatif too", 1],
       ["trying to experiment with whatiftool", 1],
       ["Some weights of the model checkpoint at bert-base-uncased were not used", 0],
       ["This IS expected if you are initializing BertForMaskedLM", 0]]
# Calling DataFrame constructor on list
df = pd.DataFrame(lst,columns=['text','labels'])
df
| text | labels |
| -- | -- |
| learn the whatif too | 1 |
| trying to experiment with whatiftool | 1 |
| Some weights of the model checkpoint at bert-b... | 0 |
| This IS expected if you are initializing BertF... | 0 |
#@title Define custom prediction functions so that WIT infers using keras models
import tensorflow as tf
# Set up model helper functions:
# Convert list of tf.Examples to list of comment strings.
def examples_to_strings(examples):
  texts = [ex.features.feature['text'].bytes_list.value[0] for ex in examples]
  labels = [ex.features.feature['labels'].int64_list.value[0] for ex in examples]
  labels = torch.tensor(labels).unsqueeze(0)
  if sys.version_info >= (3, 0):
    texts = [t.decode('utf-8') for t in texts]
  return texts, labels
# Get raw string out of tf.Example and prepare it for keras model input
def examples_to_model_in(examples, tokenizer):
  texts, labels = examples_to_strings(examples)
  print(texts, labels)
  # Tokenize strings into fixed-length sequences of integers based on the tokenizer
  # and model padding
  model_ins = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
  # model_ins = tf.keras.preprocessing.sequence.pad_sequences(
  #     text_sequences, maxlen=PADDING_LEN)
  return model_ins, labels
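For reference, the tokenizer call above returns a dict-like batch encoding (input_ids, attention_mask, and so on), which is why it can later be unpacked with ** into the model. A quick illustrative check on the first row of the dataframe:
enc = tokenizer1(["learn the whatif too"], truncation=True, padding=True, max_length=512, return_tensors="pt")
print(enc.keys())              # should include 'input_ids' and 'attention_mask'
print(enc['input_ids'].shape)  # (batch_size, sequence_length)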
# WIT predict functions:
def custom_predict_1(examples_to_infer):
  model_ins, labels = examples_to_model_in(examples_to_infer, tokenizer1)
  preds = model1(**model_ins, labels=labels)
  print(preds)
  return preds

def custom_predict_2(examples_to_infer):
  model_ins, labels = examples_to_model_in(examples_to_infer, tokenizer2)
  preds = model2(**model_ins, labels=labels)
  return preds
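Note that both functions above return the raw SequenceClassifierOutput from the model. WIT's custom predict functions generally expect a list of per-class scores per example, so a small wrapper along these lines may be needed (a minimal sketch; the name custom_predict_probs is just illustrative, and it assumes a softmax over the two logits):
import torch.nn.functional as F

def custom_predict_probs(examples_to_infer):
  # Tokenize and run the uncased model without tracking gradients.
  model_ins, labels = examples_to_model_in(examples_to_infer, tokenizer1)
  with torch.no_grad():
    out = model1(**model_ins)
  # One list of class scores per example, e.g. [[p_class0, p_class1], ...].
  return F.softmax(out.logits, dim=1).numpy().tolist()
Such a wrapper could then be passed to set_custom_predict_fn in place of custom_predict_1.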
def make_label_column_numeric(df, label_column, test):
  df[label_column] = np.where(test(df[label_column]), 1, 0)
import numpy as np
import tensorflow as tf
# Converts a dataframe into a list of tf.Example protos.
def df_to_examples(df, columns=None):
  examples = []
  if columns == None:
    columns = df.columns.values.tolist()
  for index, row in df.iterrows():
    example = tf.train.Example()
    for col in columns:
      if df[col].dtype is np.dtype(np.int64):
        example.features.feature[col].int64_list.value.append(int(row[col]))
      elif df[col].dtype is np.dtype(np.float64):
        example.features.feature[col].float_list.value.append(row[col])
      elif row[col] == row[col]:
        example.features.feature[col].bytes_list.value.append(row[col].encode('utf-8'))
    examples.append(example)
  return examples
label_column = 'labels'
make_label_column_numeric(df, label_column, lambda val: val)
# Convert the dataframe rows into tf.Example protos (used as `examples` below).
examples = df_to_examples(df)
import tensorflow as tf
import tensorflow_hub as hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# For this use-case, we set distance between datapoints to be cosine distance
# between unit-normalized embeddings of each datapoint from the tf.Hub
# Universal Sentence Encoder.
def universal_sentence_encoder_distance(input_example, examples_to_compare, _):
  # Extract comment strings (examples_to_strings returns (texts, labels); keep only the texts)
  input_sentence = examples_to_strings([input_example])[0][0]
  sentences = examples_to_strings(examples_to_compare)[0]
  # Normalize all embeddings for cosine distance operation
  input_emb = tf.squeeze(tf.nn.l2_normalize(embed([input_sentence]), axis=1))
  sentences_emb = tf.nn.l2_normalize(embed(sentences), axis=1)
  # Tile the input example for easy comparison to all examples
  multiply = tf.constant([len(examples_to_compare)])
  input_matrix = tf.reshape(tf.tile(input_emb, multiply),
                            [multiply[0], tf.shape(input_emb)[0]])
  # Compute cosine distance from input example to all examples.
  cosine_distance = tf.keras.losses.CosineSimilarity(
      axis=1, reduction=tf.losses.Reduction.NONE)
  distances = cosine_distance(sentences_emb, input_matrix)
  results = tf.squeeze(distances)
  return results.numpy().tolist()
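As a quick sanity check (assuming examples = df_to_examples(df) from above has been run), the distance function can be called directly on the protos:
# Cosine distances from the first datapoint to the remaining three.
dists = universal_sentence_encoder_distance(examples[0], examples[1:], None)
print(dists)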
custom_predict_1(examples)
['learn the whatif too', 'trying to experiment with whatiftool', 'Some weights of the model checkpoint at bert-base-uncased were not used', 'This IS expected if you are initializing BertForMaskedLM'] tensor([[1, 1, 0, 0]])
SequenceClassifierOutput(loss=tensor(0.7033, grad_fn=<NllLossBackward0>), logits=tensor([[-0.2599, -0.4994],
[-0.2791, -0.5455],
[-0.2001, -0.6373],
[-0.2547, -0.3226]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
SequenceClassifierOutput([('loss', tensor(0.7033, grad_fn=<NllLossBackward0>)),
('logits', tensor([[-0.2599, -0.4994],
[-0.2791, -0.5455],
[-0.2001, -0.6373],
[-0.2547, -0.3226]], grad_fn=<AddmmBackward0>))])
#@title Invoke What-If Tool for the data and two models (Note that this step may take a while due to the prediction speed of the BERT models){display-mode: "form"}
from witwidget.notebook.visualization import WitWidget, WitConfigBuilder
num_datapoints = 4 #@param {type: "number"}
tool_height_in_px = 720 #@param {type: "number"}
# Setup the tool with the test examples and the trained classifier
config_builder = WitConfigBuilder(examples[:num_datapoints]).set_custom_predict_fn(
custom_predict_1).set_compare_custom_predict_fn(custom_predict_2).set_custom_distance_fn(
universal_sentence_encoder_distance)
wv = WitWidget(config_builder, height=tool_height_in_px)