import pandas as pd
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
# 2
# Load dataset
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your file path
survey_data = pd.read_csv(file_path)
# 3
# ### 1. Text Preprocessing ###
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Update preprocessing function to include stop word removal
def preprocess_text_with_stopwords(text):
    """Cleans text by lowercasing and removing URLs, HTML tags,
    non-ASCII characters, digits, punctuation, extra whitespace,
    and stop words."""
    try:
        text = str(text).lower()
        # Strip URLs and HTML tags first, while the punctuation that
        # delimits them is still intact
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
        text = re.sub(r'\n', ' ', text)  # Replace newlines with spaces
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
        # Remove stop words
        text = " ".join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)
        return text
    except Exception:
        return text
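# Quick sanity check of the cleaning pipeline (the sample string is
# illustrative, not taken from the survey data):
sample = "Visit <b>https://example.com</b> today!! The labs are GREAT in 2024."
print(preprocess_text_with_stopwords(sample))  # expected output: 'visit today labs great'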
# Apply updated preprocessing
processed_data = survey_data.copy()
for col in processed_data.columns:
    processed_data[col] = processed_data[col].apply(preprocess_text_with_stopwords)
# 4
processed_data.to_csv('preprocessed_data.csv', index=False)  # Save to Colab environment
# Download the file
from google.colab import files
files.download('preprocessed_data.csv')
# 5
### 2. Sentiment Analysis ###
def analyze_sentiment(text):
"""Classifies sentiment as 'happy', 'neutral', or 'unhappy'."""
try:
blob = TextBlob(text)
        polarity = blob.sentiment.polarity  # Polarity ranges from -1 (negative) to 1 (positive)
if polarity > 0:
return 'happy'
elif polarity == 0:
return 'neutral'
else:
return 'unhappy'
except Exception:
return 'neutral'
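# Illustrative polarity check (sample sentences are assumptions, not survey
# responses): TextBlob scores these positive, zero, and negative, which map
# to 'happy', 'neutral', and 'unhappy'.
for s in ["the labs are excellent", "the labs are on the second floor", "the labs are terrible"]:
    print(s, '->', analyze_sentiment(s))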
# Add sentiment columns for each facility
sentiment_data = processed_data.copy()
for col in sentiment_data.columns:
    sentiment_data[col + '_sentiment'] = sentiment_data[col].apply(analyze_sentiment)
# 6
### 3. Sentiment Analysis Summary ###
# Count sentiments for each facility
facility_sentiment_cols = [col for col in sentiment_data.columns if '_sentiment' in col]
# value_counts gives no guaranteed row order, so reindex to a fixed label
# order instead of renaming columns by position
sentiment_summary = (
    sentiment_data[facility_sentiment_cols]
    .apply(pd.Series.value_counts)
    .reindex(['happy', 'neutral', 'unhappy'])
    .fillna(0)
    .astype(int)
    .T
)
# Overall sentiment counts
overall_sentiment_counts = sentiment_summary.sum()
# 7
### Visualization ###
# Overall Sentiment Distribution (Bar and Pie Charts)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
overall_sentiment_counts.plot(kind='bar', color=['green', 'orange', 'red'], ax=axes[0])
axes[0].set_title('Overall Sentiment Distribution (Bar Chart)')
axes[0].set_xlabel('Sentiment')
axes[0].set_ylabel('Count')
overall_sentiment_counts.plot(kind='pie', autopct='%1.1f%%', colors=['green', 'orange', 'red'], ax=axes[1])
axes[1].set_title('Overall Sentiment Distribution (Pie Chart)')
axes[1].set_ylabel('')
plt.tight_layout()
plt.show()
# 8
# Facility-Wise Sentiment Distribution (Stacked Bar Chart)
sentiment_summary.plot(
kind='bar',
stacked=True,
figsize=(12, 8),
title='Facility-Wise Sentiment Distribution',
color=['green', 'orange', 'red']
)
plt.xlabel('Facilities')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.show()
# 9
# Individual Facility Sentiment Pie Charts
rows = (len(sentiment_summary) // 3) + (1 if len(sentiment_summary) % 3 else 0)
fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))
axes = axes.flatten()
for idx, facility in enumerate(sentiment_summary.index):
sentiment_summary.loc[facility].plot(
kind='pie',
ax=axes[idx],
autopct='%1.1f%%',
colors=['green', 'orange', 'red'],
title=f'{facility} Sentiment Distribution'
)
axes[idx].set_ylabel('')
for ax in axes[len(sentiment_summary):]:
ax.axis('off')
plt.tight_layout()
plt.show()
# 10
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
# Assuming 'processed_data' contains the preprocessed text data
# Dynamically set the column name
column_name = 'Internship' # Replace this with your desired column
# Bag-of-Words Extraction
bow_vectorizer = CountVectorizer(max_features=1000)  # Limit vocabulary size
bow_features = bow_vectorizer.fit_transform(processed_data[column_name])  # Use dynamic column name
bow_term_frequencies = bow_features.sum(axis=0).A1  # Convert sparse matrix to array
# Create BoW DataFrame
bow_term_df = pd.DataFrame({
'Term': bow_vectorizer.get_feature_names_out(),
'Frequency': bow_term_frequencies
}).sort_values(by='Frequency', ascending=False)
# TF-IDF Extraction
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit vocabulary size
tfidf_features = tfidf_vectorizer.fit_transform(processed_data[column_name])  # Use dynamic column name
tfidf_term_scores = tfidf_features.sum(axis=0).A1  # Convert sparse matrix to array
# Create TF-IDF DataFrame
tfidf_term_df = pd.DataFrame({
'Term': tfidf_vectorizer.get_feature_names_out(),
'TF-IDF Score': tfidf_term_scores
}).sort_values(by='TF-IDF Score', ascending=False)
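# A quick side-by-side look at the top terms (diagnostic only): BoW ranks by
# raw count, while TF-IDF downweights terms that appear in most responses.
print(bow_term_df.head(10).to_string(index=False))
print(tfidf_term_df.head(10).to_string(index=False))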
# Merge BoW and TF-IDF for comparison
comparison_df = pd.merge(
    bow_term_df.rename(columns={"Frequency": "Frequency_BoW"}).head(20),
    tfidf_term_df.rename(columns={"TF-IDF Score": "Frequency_TF-IDF"}).head(20),
on="Term",
how="outer"
).fillna(0)
# Sort by Bag-of-Words Frequency for consistency
comparison_df = comparison_df.sort_values(by="Frequency_BoW", ascending=False)
# Plot the comparison graph
plt.figure(figsize=(12, 8))
# Bar width for side-by-side bars
bar_width = 0.35
index = range(len(comparison_df))
# Bag-of-Words Bar
plt.bar(index, comparison_df["Frequency_BoW"], bar_width, label="Bag-of-Words", color="skyblue")
# TF-IDF Bar
plt.bar([i + bar_width for i in index], comparison_df["Frequency_TF-IDF"], bar_width, label="TF-IDF", color="orange")
# Add labels and title
plt.xlabel("Terms")
plt.ylabel("Frequency/TF-IDF Score")
plt.title(f"Comparison of Bag-of-Words and TF-IDF Representations for
'{column_name}'")
plt.xticks([i + bar_width / 2 for i in index], comparison_df["Term"],
rotation=45, ha="right")
plt.legend()
plt.tight_layout()
plt.show()
# 11
from sklearn.model_selection import train_test_split
# Target variable based on the dynamic column name
target_column_name = column_name + '_sentiment'  # Append '_sentiment' dynamically
y = sentiment_data[target_column_name]  # Use the dynamic sentiment column name
# Split for BoW features
X_train_bow, X_test_bow, y_train, y_test = train_test_split(
bow_features, y, test_size=0.2, random_state=42
)
# Split for TF-IDF features
X_train_tfidf, X_test_tfidf, _, _ = train_test_split(
tfidf_features, y, test_size=0.2, random_state=42
)
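# Note: with imbalanced sentiment labels, a stratified split keeps the class
# proportions consistent across train and test. A minimal sketch (kept under
# new variable names so it does not overwrite the splits above; assumes every
# sentiment class appears at least twice):
X_train_strat, X_test_strat, y_train_strat, y_test_strat = train_test_split(
    bow_features, y, test_size=0.2, random_state=42, stratify=y
)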
# 12
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# Initialize models (one set per feature representation, so TF-IDF results
# are not produced by BoW-trained models)
log_reg = LogisticRegression(max_iter=1000, random_state=42)
random_forest = RandomForestClassifier(random_state=42)
svm = SVC(kernel='linear', random_state=42)
log_reg_tfidf = LogisticRegression(max_iter=1000, random_state=42)
random_forest_tfidf = RandomForestClassifier(random_state=42)
svm_tfidf = SVC(kernel='linear', random_state=42)
# Train models on BoW features
log_reg.fit(X_train_bow, y_train)
random_forest.fit(X_train_bow, y_train)
svm.fit(X_train_bow, y_train)
# Train models on TF-IDF features
log_reg_tfidf.fit(X_train_tfidf, y_train)
random_forest_tfidf.fit(X_train_tfidf, y_train)
svm_tfidf.fit(X_train_tfidf, y_train)
# 13
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score
# Dynamically calculate model accuracies
bow_accuracies = [
accuracy_score(y_test, log_reg.predict(X_test_bow)),
accuracy_score(y_test, random_forest.predict(X_test_bow)),
accuracy_score(y_test, svm.predict(X_test_bow))
]
tfidf_accuracies = [
    accuracy_score(y_test, log_reg_tfidf.predict(X_test_tfidf)),
    accuracy_score(y_test, random_forest_tfidf.predict(X_test_tfidf)),
    accuracy_score(y_test, svm_tfidf.predict(X_test_tfidf))
]
# Plotting model accuracy comparison
model_names = ['Logistic Regression', 'Random Forest', 'SVM']
x = range(len(model_names))
bar_width = 0.35
plt.figure(figsize=(10, 6))
plt.bar(x, bow_accuracies, width=bar_width, label='BoW', color='skyblue')
plt.bar([i + bar_width for i in x], tfidf_accuracies, width=bar_width, label='TF-IDF', color='orange')
# Add labels and title
plt.xticks([i + bar_width / 2 for i in x], model_names)
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()
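# The same accuracies tabulated for quick reference alongside the chart
# (a small convenience, not part of the original analysis):
accuracy_table = pd.DataFrame(
    {'BoW': bow_accuracies, 'TF-IDF': tfidf_accuracies}, index=model_names
)
print(accuracy_table.round(3))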
# 14
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score
# Dynamically calculate metrics for a specific model (e.g., Logistic Regression)
bow_scores = [
    precision_score(y_test, log_reg.predict(X_test_bow), average='weighted'),
    recall_score(y_test, log_reg.predict(X_test_bow), average='weighted'),
    f1_score(y_test, log_reg.predict(X_test_bow), average='weighted')
]
tfidf_scores = [
    precision_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted'),
    recall_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted'),
    f1_score(y_test, log_reg_tfidf.predict(X_test_tfidf), average='weighted')
]
# Plot grouped bar chart
metrics = ['Precision', 'Recall', 'F1-Score']
x = np.arange(len(metrics))
bar_width = 0.35
plt.figure(figsize=(10, 6))
plt.bar(x, bow_scores, width=bar_width, label='BoW', color='skyblue')
plt.bar(x + bar_width, tfidf_scores, width=bar_width, label='TF-IDF', color='orange')
# Add labels and title
plt.xticks(x + bar_width / 2, metrics)
plt.ylabel('Score')
plt.title('Model Performance Metrics (BoW vs. TF-IDF)')
plt.legend()
plt.tight_layout()
plt.show()
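# The cell above reports metrics for Logistic Regression only; this loop
# gathers weighted F1 for all three models (a convenience sketch; the model
# variables come from the training cell above):
for name, m_bow, m_tfidf in [
    ('Logistic Regression', log_reg, log_reg_tfidf),
    ('Random Forest', random_forest, random_forest_tfidf),
    ('SVM', svm, svm_tfidf),
]:
    f1_bow = f1_score(y_test, m_bow.predict(X_test_bow), average='weighted')
    f1_tfidf = f1_score(y_test, m_tfidf.predict(X_test_tfidf), average='weighted')
    print(f"{name}: F1 (BoW) = {f1_bow:.3f}, F1 (TF-IDF) = {f1_tfidf:.3f}")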
# 15
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
# Predictions for BoW features
y_pred_log_reg_bow = log_reg.predict(X_test_bow)
y_pred_rf_bow = random_forest.predict(X_test_bow)
y_pred_svm_bow = svm.predict(X_test_bow)
# Predictions for TF-IDF features (from the TF-IDF-trained models)
y_pred_log_reg_tfidf = log_reg_tfidf.predict(X_test_tfidf)
y_pred_rf_tfidf = random_forest_tfidf.predict(X_test_tfidf)
y_pred_svm_tfidf = svm_tfidf.predict(X_test_tfidf)
# Confusion Matrices (label order fixed explicitly so it matches the
# heatmap tick labels below)
sentiment_labels = ['happy', 'neutral', 'unhappy']
cm_log_reg_bow = confusion_matrix(y_test, y_pred_log_reg_bow, labels=sentiment_labels)
cm_rf_bow = confusion_matrix(y_test, y_pred_rf_bow, labels=sentiment_labels)
cm_svm_bow = confusion_matrix(y_test, y_pred_svm_bow, labels=sentiment_labels)
cm_log_reg_tfidf = confusion_matrix(y_test, y_pred_log_reg_tfidf, labels=sentiment_labels)
cm_rf_tfidf = confusion_matrix(y_test, y_pred_rf_tfidf, labels=sentiment_labels)
cm_svm_tfidf = confusion_matrix(y_test, y_pred_svm_tfidf, labels=sentiment_labels)
# Plotting all confusion matrices
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
# Titles for the matrices
titles = [
"Logistic Regression (BoW)", "Random Forest (BoW)", "SVM (BoW)",
"Logistic Regression (TF-IDF)", "Random Forest (TF-IDF)", "SVM (TF-
IDF)"
]
# All confusion matrices
conf_matrices = [
cm_log_reg_bow, cm_rf_bow, cm_svm_bow,
cm_log_reg_tfidf, cm_rf_tfidf, cm_svm_tfidf
]
# Plotting each heatmap
for i, ax in enumerate(axes.flat):
sns.heatmap(
conf_matrices[i], annot=True, fmt='d', cmap='Blues',
xticklabels=['Happy', 'Neutral', 'Unhappy'],
yticklabels=['Happy', 'Neutral', 'Unhappy'], ax=ax
)
ax.set_title(titles[i])
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.tight_layout()
plt.show()
# 16
# Step 1: Use the already defined `processed_data` from your script.
# Step 2: Combine text from all columns to build a unified vocabulary
from sklearn.metrics.pairwise import cosine_similarity
combined_text_all = processed_data.apply(
lambda row: ' '.join(row.astype(str)), axis=1
)
# Fit the TF-IDF vectorizer on the combined text
tfidf_vectorizer_all = TfidfVectorizer()
tfidf_vectorizer_all.fit(combined_text_all)
# Step 3: Transform each column using the unified vocabulary
tfidf_vectors_all = {
    col: tfidf_vectorizer_all.transform(processed_data[col].astype(str))
    for col in processed_data.columns
}
# Step 4: Compute Pairwise Cosine Similarity for All Labels
similarity_matrix_all = np.zeros((len(processed_data.columns), len(processed_data.columns)))
for i, col1 in enumerate(processed_data.columns):
for j, col2 in enumerate(processed_data.columns):
if i == j: # Self-similarity
similarity_matrix_all[i, j] = 1.0
else: # Pairwise similarity
similarity_matrix_all[i, j] = cosine_similarity(
tfidf_vectors_all[col1], tfidf_vectors_all[col2]
).mean()
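# The loop above averages all pairwise response similarities, which is
# quadratic in the number of responses per column pair. A cheaper sketch
# (an alternative, not the method used above): compare each column's mean
# TF-IDF vector (its centroid) instead.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
centroids = np.vstack([
    np.asarray(tfidf_vectors_all[col].mean(axis=0)) for col in processed_data.columns
])
centroid_similarity = cosine_similarity(centroids)  # same shape as similarity_matrix_all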
# Step 5: Visualize the Similarity Matrix for All Labels
plt.figure(figsize=(12, 10))
sns.heatmap(
similarity_matrix_all,
xticklabels=processed_data.columns,
yticklabels=processed_data.columns,
cmap='coolwarm',
annot=True,
fmt=".2f",
annot_kws={"size": 10}, # Customize annotation font size
cbar_kws={"shrink": 0.8, "label": "Similarity Score"} # Color bar
customization
)
plt.title("Text Similarity Between All Labels (Cosine Similarity)",
fontsize=16)
plt.xlabel("Labels", fontsize=12)
plt.ylabel("Labels", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10) # Rotate x-axis
labels for better readability
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()
# Step 6: Identify the Most and Least Similar Pairs Across All Labels
similarity_df_all = pd.DataFrame(
similarity_matrix_all,
index=processed_data.columns,
columns=processed_data.columns
)
# Melt the matrix for pairwise comparison
similarity_melted_all = similarity_df_all.reset_index().melt(
id_vars='index',
var_name='Label 2',
value_name='Similarity'
).rename(columns={'index': 'Label 1'})
# Remove self-similarity (diagonal values)
similarity_melted_all = similarity_melted_all[
    similarity_melted_all['Label 1'] != similarity_melted_all['Label 2']
]
# Sort for most and least similar pairs
most_similar_all = similarity_melted_all.sort_values(by='Similarity', ascending=False).head(1)
least_similar_all = similarity_melted_all.sort_values(by='Similarity', ascending=True).head(1)
# Output results
print("Most Similar Pair:")
print(most_similar_all)
print("\nLeast Similar Pair:")
print(least_similar_all)
# 17
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt
# Step 1: Load and Preprocess Dataset
file_path = '/content/Nri_Textual_Survey_Data.csv'  # Replace with your dataset path
data = pd.read_csv(file_path)
# Preprocess text
def preprocess_text_dl(text_dl):
"""Clean text."""
text_dl = str(text_dl).lower()
text_dl = re.sub(r'[^\w\s]', '', text_dl) # Remove punctuation
    text_dl = re.sub(r'\s+', ' ', text_dl).strip()  # Remove extra spaces
return text_dl
# Apply preprocessing to text column dynamically
text_column = data.columns[0]  # Dynamically use the first column as text
data[text_column] = data[text_column].apply(preprocess_text_dl)
# Analyze sentiment dynamically
def analyze_sentiment(text_dl):
"""Classify sentiment using polarity."""
from textblob import TextBlob
try:
blob = TextBlob(text_dl)
polarity = blob.sentiment.polarity
if polarity > 0:
return 0 # Happy
elif polarity == 0:
return 1 # Neutral
else:
return 2 # Unhappy
    except Exception:
        return 1  # Default to neutral on failure
data['label'] = data[text_column].apply(analyze_sentiment)
# Step 2: Split Data
train_texts, test_texts, train_labels, test_labels = train_test_split(
data[text_column], data['label'], test_size=0.2, random_state=42
)
# Step 3: Tokenize Using BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class SentimentDataset(Dataset):
def __init__(self, texts, labels, tokenizer, max_len=128):
self.texts = texts
self.labels = labels
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts.iloc[idx]
label = self.labels.iloc[idx]
encoding = self.tokenizer(
text,
truncation=True,
padding='max_length',
max_length=self.max_len,
return_tensors="pt"
)
return {
'input_ids': encoding['input_ids'].squeeze(0),
'attention_mask': encoding['attention_mask'].squeeze(0),
'labels': torch.tensor(label, dtype=torch.long)
}
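# Quick shape check on one encoded example (diagnostic only; with
# max_len=128 both tensors should be length-128 vectors):
_sample_item = SentimentDataset(train_texts, train_labels, tokenizer)[0]
print(_sample_item['input_ids'].shape, _sample_item['attention_mask'].shape)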
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)
# Step 4: Define BERT Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(device)
# Step 5: Train Model
optimizer = AdamW(model.parameters(), lr=5e-5)
epochs = 1
model.train()
for epoch in range(epochs):
total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
optimizer.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss
total_loss += loss.item()
loss.backward()
optimizer.step()
print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")
# Step 6: Evaluate Model
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
for batch in tqdm(test_loader, desc="Evaluating"):
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
all_preds.extend(preds.cpu().numpy())
all_labels.extend(labels.cpu().numpy())
# Calculate Accuracy
accuracy = accuracy_score(all_labels, all_preds)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
cm = confusion_matrix(all_labels, all_preds, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Happy", "Neutral", "Unhappy"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix - Sentiment Analysis")
plt.show()