DL_20-WordEmbeddings.ipynb

Word Embeddings
https://www.tensorflow.org/tutorials/text/word_embeddings
http://projector.tensorflow.org/
Download the IMDb Dataset (Sentiment Analysis)
https://ai.stanford.edu/~amaas/data/sentiment/
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
(curl progress output: 80.2M downloaded in 0:00:53, average speed ~1.5 MB/s)
Path of the extracted folder: /content/aclImdb
Remove the folder for the unsupervised (unlabeled) category
! rm -rf /content/aclImdb/train/unsup
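A quick check of the resulting layout (a minimal sketch; assumes the archive extracted into the working directory as aclImdb/) confirms that only the labeled folders remain under train:

import os

# 'unsup' should be gone; 'pos' and 'neg' (plus a few metadata files) should remain.
print(sorted(os.listdir("aclImdb/train")))
print(sorted(os.listdir("aclImdb/test")))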
Data Preparation for Batch Processing
import tensorflow as tf
import numpy as np
batch_size = 32
# Use the same seed for the "training" and "validation" subsets so the two
# splits of aclImdb/train do not overlap.
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=1337,
)
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/train",
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=1337,
)
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    "aclImdb/test",
    batch_size=batch_size,
)
Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.
print(
    "Number of batches in raw_train_ds: %d"
    % tf.data.experimental.cardinality(raw_train_ds)
)
print(
    "Number of batches in raw_val_ds: %d"
    % tf.data.experimental.cardinality(raw_val_ds)
)
print(
    "Number of batches in raw_test_ds: %d"
    % tf.data.experimental.cardinality(raw_test_ds)
)
Number of batches in raw_train_ds: 625
Number of batches in raw_val_ds: 157
Number of batches in raw_test_ds: 782
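These counts are just the split sizes divided by the batch size, rounded up:

import math
print(math.ceil(20000 / 32), math.ceil(5000 / 32), math.ceil(25000 / 32))  # 625 157 782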
Numeric labels
Negative --> 0
Positive --> 1
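The label indices follow the alphabetical order of the class folders (neg, pos). A quick way to confirm the mapping (a small check, not part of the original run):

print(raw_train_ds.class_names)  # ['neg', 'pos'] -> neg = 0, pos = 1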
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(5):
        print(text_batch.numpy()[i])
        print(label_batch.numpy()[i])
b'I\'ve seen tons of science fiction from the 70s; some horrendously bad, and others thought provoking and truly frightening. Soylen
1
b'First than anything, I\'m not going to praise I\xc3\xb1arritu\'s short film, even I\'m Mexican and proud of his success in mainstr
1
b'Blood Castle (aka Scream of the Demon Lover, Altar of Blood, Ivanna--the best, but least exploitation cinema-sounding title, and s
1
b"I was talked into watching this movie by a friend who blubbered on about what a cute story this was.<br /><br />Yuck.<br /><br />I
0
b"Michelle Rodriguez is the defining actress who could be the charging force for other actresses to look out for. She has the audaci
1
Obtain the first-level numeric representation (vocabulary indices)
from tensorflow.keras.layers import TextVectorization
import string
import re
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )
# Model constants.
max_features = 10000
embedding_dim = 64
sequence_length = 50
# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set an output sequence length because the samples are not all the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)
text_ds = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(text_ds)
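To see what the custom standardization does before tokenization, a small sanity check (the sample string is made up for illustration):

sample = tf.constant(["Some <br /> HTML, and punctuation!"])
print(custom_standardization(sample).numpy())
# lowercased, '<br />' replaced by a space, punctuation stripped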
vectorize_layer.get_vocabulary()
['',
'[UNK]',
'the',
'and',
'a',
'of',
'to',
'is',
'in',
'it',
'this',
'i',
'that',
'was',
'as',
'for',
'with',
'movie',
'but',
'film',
'on',
'not',
'you',
'his',
'are',
'have',
'he',
'be',
'one',
'its',
'at',
'all',
'by',
'an',
'they',
'who',
'from',
'so',
'like',
'her',
'or',
'just',
'about',
'has',
'if',
'out',
'some',
'there',
'what',
'good',
'when',
'more',
'very',
'even',
'she',
'my',
'up',
 ...]
print(len(vectorize_layer.get_vocabulary()))
10000
vectorize_layer(['Hi how']).numpy()
array([[9586, 88, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]])
vectorize_layer([['Hi how are you?']]).numpy()
array([[9586, 88, 24, 22, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0]])
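The integers are indices into the adapted vocabulary, so they can be mapped back to tokens (a quick check; out-of-vocabulary words map to index 1, '[UNK]', and 0 is padding):

vocab = vectorize_layer.get_vocabulary()
ids = vectorize_layer(["Hi how are you?"]).numpy()[0]
print([vocab[i] for i in ids if i != 0])  # non-padding tokens, e.g. ['hi', 'how', 'are', 'you']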
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label
# Vectorize the data.
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)
# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
val_ds = val_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)
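As an aside, tf.data can also pick the prefetch buffer size automatically; an equivalent variant (not run here) would be:

# train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
# val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
# test_ds = test_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)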
for text_batch, label_batch in train_ds.take(1):
    print(text_batch.shape)
    for i in range(2):
        print(text_batch[i])
(32, 50)
tf.Tensor(
[ 11 1600 10 506 17 2 239 102 11 25 121 107 11 252
4 3550 1470 766 97 25 222 4 123 278 2 418 19 11
25 121 107 3 11 25 107 46 79 513 154 610 539 11
1456 8762 6 881 9 3 12 13], shape=(50,), dtype=int64)
tf.Tensor(
[ 11 1073 214 106 634 145 4 17 12 11 194 16 31 2
49 150 8 9 59 27 277 145 11 412 252 9 50 2
17 1023 3 11 65 417 57 312 48 65 1 13 1181 82
11 97 25 329 12 58 380 140], shape=(50,), dtype=int64)
Create the model with an Embedding layer
Convert the integer indices to vectors of real numbers. The dimension of the vector is a hyperparameter.
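For intuition, a tiny standalone example (the weights are random at initialization, so only the shapes matter here):

from tensorflow.keras import layers

demo_embedding = layers.Embedding(input_dim=10, output_dim=4)
demo_output = demo_embedding(tf.constant([[1, 2, 3]]))
print(demo_output.shape)  # (1, 3, 4): each integer index becomes a 4-dimensional vector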
from tensorflow import keras
from tensorflow.keras import layers
# Input for variable-length sequences of integers
input_Embedding = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 64-dimensional vector
x = layers.Embedding(max_features, embedding_dim, name='embedding')(input_Embedding)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(16, activation="relu")(x)
# Add a classifier
output_Embedding = layers.Dense(1, activation="sigmoid")(x)
#output_Embedding = layers.Dense(2, activation="softmax")(x)
model_Embedding = keras.Model(input_Embedding, output_Embedding)
model_Embedding.summary()
Model: "functional"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type) ┃ Output Shape ┃ Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ input_layer (InputLayer) │ (None, None) │ 0 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ embedding (Embedding) │ (None, None, 64) │ 640,000 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ global_average_pooling1d │ (None, 64) │ 0 │
│ (GlobalAveragePooling1D) │ │ │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense (Dense) │ (None, 16) │ 1,040 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_1 (Dense) │ (None, 1) │ 17 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
Total params: 641,057 (2.45 MB)
Trainable params: 641,057 (2.45 MB)
Non-trainable params: 0 (0.00 B)
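The parameter counts follow directly from the constants above: the embedding table has max_features x embedding_dim entries, and each Dense layer has its weights plus biases.

print(max_features * embedding_dim)  # 640,000 embedding weights
print(embedding_dim * 16 + 16)       # 1,040 for Dense(16)
print(16 * 1 + 1)                    # 17 for Dense(1)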
model_Embedding.compile(loss="binary_crossentropy",
                        optimizer="adam",
                        metrics=["accuracy"])
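If the commented-out two-unit softmax head were used instead, the loss would need to change to match the integer labels (a sketch of that variant, not run here):

# output_Embedding = layers.Dense(2, activation="softmax")(x)
# model_Embedding.compile(loss="sparse_categorical_crossentropy",
#                         optimizer="adam",
#                         metrics=["accuracy"])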
epochs = 3
# Fit the model using the training and validation datasets.
model_Embedding.fit(train_ds, validation_data=val_ds, epochs=epochs)
Epoch 1/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 11s 15ms/step - accuracy: 0.6480 - loss: 0.6232 - val_accuracy: 0.7730 - val_loss: 0.4684
Epoch 2/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 6s 10ms/step - accuracy: 0.8201 - loss: 0.3971 - val_accuracy: 0.7770 - val_loss: 0.4693
Epoch 3/3
625/625 ━━━━━━━━━━━━━━━━━━━━ 10s 9ms/step - accuracy: 0.8546 - loss: 0.3323 - val_accuracy: 0.7744 - val_loss: 0.5021
<keras.src.callbacks.history.History at 0x7a23ac186d40>
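The test split prepared earlier is never scored above; it can be evaluated the same way (results will vary from run to run):

test_loss, test_acc = model_Embedding.evaluate(test_ds)
print("Test accuracy:", test_acc)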
weights = model_Embedding.get_layer('embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()
weights.shape
(10000, 64)
len(vocab)
10000
import io
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')
for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip index 0, it's the padding token.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()
Upload vectors.tsv and metadata.tsv to http://projector.tensorflow.org/ to visualize the learned embeddings.
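To pull the two TSV files out of the Colab runtime for upload to the Embedding Projector (assuming a Colab environment), one option is:

from google.colab import files

files.download('vectors.tsv')
files.download('metadata.tsv')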