import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (Input, Embedding, Conv1D, MaxPooling1D,
                                     LSTM, Dense, Dropout, Concatenate)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
print('import done')
# Define hyperparameters
MAX_SEQ_LENGTH = 100      # tokens per tweet after padding/truncation
MAX_NB_WORDS = 20000      # vocabulary size kept by the tokenizer
EMBEDDING_DIM = 100       # must match the GloVe file loaded below
FILTER_SIZES = [3, 5, 7]  # kernel widths of the parallel Conv1D branches
NUM_FILTERS = 256         # filters per Conv1D branch
LSTM_UNITS = 256
DENSE_UNITS = 1           # single sigmoid unit for binary sentiment
DROPOUT_RATE = 0.5
PATIENCE = 10             # early-stopping patience in epochs
print('config done')
# Load the dataset
df = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv",
                 encoding='latin-1', header=None)
df.columns = ["sentiment", "id", "date", "query", "user", "text"]
df = df[["sentiment", "text"]]
df["sentiment"] = df["sentiment"].replace({0: "negative", 4: "positive"})
texts = df["text"].values
labels = df["sentiment"].values
labels = np.array([1 if label == "positive" else 0 for label in labels])
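# Sentiment140 is sorted by label (all negatives first), and validation_split
# takes the last fraction of the arrays *before* any shuffling, so shuffle once
# here to avoid a single-class validation set.
rng = np.random.default_rng(42)
shuffle_idx = rng.permutation(len(texts))
texts, labels = texts[shuffle_idx], labels[shuffle_idx]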
print(df.head(10))
# Preprocess text data
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH)
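# data now has shape (len(texts), MAX_SEQ_LENGTH); shorter tweets are padded
# with zeros on the left (pad_sequences defaults to padding='pre').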
# Load pre-trained word embeddings (e.g., GloVe)
embedding_dim = EMBEDDING_DIM  # keep in sync with the hyperparameter block above
embedding_index = {}
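# glove.6B.100d.txt is not bundled with the Kaggle dataset; download it from
# https://nlp.stanford.edu/projects/glove/ and place it in the working directory.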
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefficients
# Create embedding matrix
num_words = min(MAX_NB_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
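# Words missing from GloVe (and index 0, reserved for padding) keep all-zero rows.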
# Define model architecture
inputs = Input(shape=(MAX_SEQ_LENGTH,))
embedding = Embedding(input_dim=num_words, output_dim=embedding_dim,
                      input_length=MAX_SEQ_LENGTH, weights=[embedding_matrix],
                      trainable=False)(inputs)  # trainable=False freezes the GloVe vectors; set True to fine-tune them
conv_layers = []
for filter_size in FILTER_SIZES:
    conv = Conv1D(filters=NUM_FILTERS, kernel_size=filter_size,
                  activation='relu')(embedding)
    pool = MaxPooling1D(pool_size=MAX_SEQ_LENGTH - filter_size + 1)(conv)
    conv_layers.append(pool)
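# Each branch is max-pooled over its full feature map, so every pooled tensor
# has shape (batch, 1, NUM_FILTERS) and the concatenation below has shape
# (batch, 1, len(FILTER_SIZES) * NUM_FILTERS); the LSTM therefore processes a
# one-step sequence.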
concat = Concatenate()(conv_layers)
lstm = LSTM(units=LSTM_UNITS)(concat)
dropout = Dropout(rate=DROPOUT_RATE)(lstm)
outputs = Dense(units=DENSE_UNITS, activation='sigmoid')(dropout)
model = Model(inputs=inputs, outputs=outputs)
optimizer = RMSprop(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# Train the model with early stopping
# baseline=0.85 stops training if val_accuracy fails to exceed 0.85 within
# PATIENCE epochs; restore_best_weights keeps the best epoch's weights.
es = EarlyStopping(monitor='val_accuracy', patience=PATIENCE, mode='max',
                   min_delta=0.01, baseline=0.85, restore_best_weights=True)
history = model.fit(data, labels, epochs=50, validation_split=0.3, callbacks=[es])
# Plot accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
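# Loss curves tell the same story from the other side and are worth checking
# when accuracy plateaus; this mirrors the accuracy plot above.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Validation'], loc='upper right')
plt.show()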
# Make predictions on new data
new_texts = ["is upset that he can't update his Facebook by texting it... and might
cry as a result School today ...",
"@Kenichan I dived many times for the ball. Managed to save 50% The
rest go out of bounds",
"my whole body feels itchy and like its on fire",
"@nationwideclass no, it's not behaving at all. i'm mad. why am i
here? because I can't see you all o...",
"@Kwesidei not the whole crew",
"@LettyA ahh ive always wanted to see rent love the soundtrack!!",
"@FakerPattyPattz Oh dear. Were you drinking out of the forgotten
table drinks? "]
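# Reuse the fitted tokenizer so the new tweets map to the same word indices;
# words it has never seen are silently dropped (no oov_token was set).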
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_data = pad_sequences(new_sequences, maxlen=MAX_SEQ_LENGTH)
predictions = model.predict(new_data)
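# Pair each tweet with its predicted label and probability for a quick sanity check.
for tweet, prob in zip(new_texts, predictions.ravel()):
    print(f"{'positive' if prob > 0.5 else 'negative'} ({prob:.3f}): {tweet[:60]}")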
# Evaluate the model
y_pred = (predictions > 0.5).astype(int).ravel()  # threshold sigmoid outputs and flatten to 1-D for sklearn
y_true = np.array([0, 0, 0, 0, 0, 0, 0])  # true labels of the new tweets (all from the negative rows of Sentiment140)
cm = confusion_matrix(y_true, y_pred)
print(cm)
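# Optional fuller breakdown. Pinning labels=[0, 1] keeps the report valid even
# though these sample tweets are all negative; zero_division=0 silences the
# resulting undefined-precision warnings for the positive class.
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred, labels=[0, 1],
                            target_names=['negative', 'positive'], zero_division=0))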