Natural Language Processing
Lab Assignment
R.BhanuKiran
22BCE9560
L45+L46
1. Implement text pre-processing on the Brown corpus and the Gutenberg corpus and display, separately for each corpus: the list of tokens (with count), the list of sentences (with count), the count of paragraphs, the list and count of unique words, the list and count of rare words, and the list and count of stop-words.
import nltk
from nltk.corpus import brown, gutenberg, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import string

nltk.download('punkt_tab')
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

def preprocess_and_analyze_corpus(corpus, corpus_name="Corpus"):
    print(f"\n=== 📘 Analyzing {corpus_name} ===")
    if corpus_name == "Brown":
        raw_text = " ".join(brown.words())
        fileids = brown.fileids()
    elif corpus_name == "Gutenberg":
        raw_text = " ".join(gutenberg.words())
        fileids = gutenberg.fileids()
    else:
        return
    tokens = word_tokenize(raw_text)
    sents = sent_tokenize(raw_text)
    cleaned_tokens = [
        token.lower() for token in tokens
        if token.lower() not in stop_words and token not in punctuations and token.isalpha()
    ]
    token_counts = Counter(cleaned_tokens)
    unique_words = list(token_counts.keys())
    rare_words = [word for word, count in token_counts.items() if count == 1]
    stopword_list = [token for token in tokens if token.lower() in stop_words]
    print(f"Total Tokens: {len(tokens)}")
    print(f"Sample Tokens: {tokens[:10]}")
    print(f"Total Sentences: {len(sents)}")
    print(f"Sample Sentences: {sents[:2]}")
    print(f"Total Paragraphs (FileIDs): {len(fileids)}")
    print(f"FileIDs (Used as Paragraphs): {fileids[:5]}")
    print(f"Unique Words Count: {len(unique_words)}")
    print(f"Unique Words Sample: {unique_words[:10]}")
    print(f"Rare Words Count: {len(rare_words)}")
    print(f"Rare Words Sample: {rare_words[:10]}")
    print(f"Stopwords Count: {len(stopword_list)}")
    print(f"Stopwords Sample: {stopword_list[:10]}")
    print("\n" + "-"*60)

preprocess_and_analyze_corpus(brown, corpus_name="Brown")
preprocess_and_analyze_corpus(gutenberg, corpus_name="Gutenberg")
OUTPUT:
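Note: the listing above reports file IDs as a stand-in for paragraphs. NLTK's corpus readers also expose paragraph structure directly through paras(); the short sketch below is an optional addition (assuming the same corpus downloads as above) that counts actual paragraphs.

# Optional check: count real paragraphs via the corpus readers' paras()
# method instead of treating file IDs as paragraphs.
from nltk.corpus import brown, gutenberg

for name, corpus in [("Brown", brown), ("Gutenberg", gutenberg)]:
    paragraphs = corpus.paras()  # list of paragraphs; each is a list of sentences
    print(f"{name}: {len(paragraphs)} paragraphs, "
          f"first paragraph has {len(paragraphs[0])} sentence(s)")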
2. Perform text summarization, POS tagging and NER modelling on the pre-processed corpus using the following neural networks.
1. Feed-Forward Neural Network
import torch
import torch.nn as nn
import torch.optim as optim
from collections import OrderedDict

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]
words = sorted(set(word for sent, _ in data for word in sent))
tags = sorted(set(tag for _, t in data for tag in t))
word2idx = {word: i for i, word in enumerate(words)}
tag2idx = {tag: i for i, tag in enumerate(tags)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

X = []
y = []
for sent, tag_seq in data:
    for word, tag in zip(sent, tag_seq):
        X.append(word2idx[word])
        y.append(tag2idx[tag])
X = torch.tensor(X)
y = torch.tensor(y)

class FFNN_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=32, hidden_dim=64):
        super(FFNN_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = FFNN_POS(len(word2idx), len(tag2idx))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

# Testing
model.eval()
with torch.no_grad():
    test_word = "love"
    test_input = torch.tensor([word2idx[test_word]])  # shape: [1]
    pred = model(test_input)
    predicted_tag = idx2tag[torch.argmax(pred).item()]
    print(f"Prediction for '{test_word}': {predicted_tag}")
OUTPUT:
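The question also covers NER modelling; the same single-word feed-forward classifier can be reused for it by swapping the POS tags for entity tags. The sketch below is only an illustration on made-up toy data (the sentences and the PER/LOC/O tags are assumptions, not part of the lab output).

# Hedged sketch: the FFNN_POS idea reused for NER on toy data.
import torch
import torch.nn as nn
import torch.optim as optim

ner_data = [
    (["Alice", "visited", "Paris"], ["PER", "O", "LOC"]),
    (["Bob", "lives", "in", "Delhi"], ["PER", "O", "O", "LOC"]),
]
words = sorted({w for s, _ in ner_data for w in s})
ents = sorted({t for _, ts in ner_data for t in ts})
word2idx = {w: i for i, w in enumerate(words)}
ent2idx = {t: i for i, t in enumerate(ents)}
idx2ent = {i: t for t, i in ent2idx.items()}

X = torch.tensor([word2idx[w] for s, _ in ner_data for w in s])
y = torch.tensor([ent2idx[t] for _, ts in ner_data for t in ts])

class FFNN_NER(nn.Module):
    def __init__(self, vocab_size, num_entities, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_entities)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(self.embedding(x))))

model = FFNN_NER(len(word2idx), len(ent2idx))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
for _ in range(100):
    optimizer.zero_grad()
    loss = criterion(model(X), y)
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    pred = model(torch.tensor([word2idx["Paris"]]))
    print("Predicted entity for 'Paris':", idx2ent[torch.argmax(pred).item()])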
2. Recurrent Neural Network (RNN)
import torch
import torch.nn as nn
import torch.optim as optim

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]
word_set = set(word for sentence, _ in data for word in sentence)
tag_set = set(tag for _, tags in data for tag in tags)
word2idx = {word: i + 1 for i, word in enumerate(word_set)}
word2idx["<PAD>"] = 0
tag2idx = {tag: i for i, tag in enumerate(tag_set)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

EMBEDDING_DIM = 32
HIDDEN_DIM = 64
EPOCHS = 100

def encode_sentence(sentence, tag_seq, max_len):
    word_ids = [word2idx[word] for word in sentence]
    tag_ids = [tag2idx[tag] for tag in tag_seq]
    # Padding
    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)
    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode_sentence(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class RNN_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim, hidden_dim):
        super(RNN_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.rnn(x)
        output = self.fc(output)
        return output

model = RNN_POS(len(word2idx), len(tag2idx), EMBEDDING_DIM, HIDDEN_DIM)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    output = model(X)  # (batch, seq_len, tagset_size)
    output = output.view(-1, output.shape[-1])
    y_flat = y.view(-1)
    loss = criterion(output, y_flat)
    loss.backward()
    optimizer.step()

def predict(sentence):
    model.eval()
    tokens = [word2idx.get(word, 0) for word in sentence]
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        predictions = model(input_tensor)
    pred_tags = torch.argmax(predictions, dim=2)[0]
    return [idx2tag[idx.item()] for idx in pred_tags[:len(sentence)]]

test_sentence = ["She", "plays", "football"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))
OUTPUT:
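One design note on the RNN tagger: predict() maps unseen words to index 0, which is also the padding index. A common alternative is a dedicated <UNK> id; the standalone sketch below (an assumed convention, not in the original listing) shows only that vocabulary change.

# Hedged sketch: reserving a separate <UNK> id so unknown words are not
# mapped to the padding index. Indices 0 and 1 are an assumed convention.
toy_vocab = {"I", "love", "coding", "She", "writes", "code", "They", "play", "football"}
word2idx = {"<PAD>": 0, "<UNK>": 1}
for i, word in enumerate(sorted(toy_vocab)):
    word2idx[word] = i + 2

def encode_for_inference(sentence, max_len=3):
    ids = [word2idx.get(w, word2idx["<UNK>"]) for w in sentence]
    return ids + [word2idx["<PAD>"]] * (max_len - len(ids))

print(encode_for_inference(["She", "plays", "football"]))  # "plays" maps to the <UNK> id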
3. Long Short-Term Memory (LSTM)
import torch
import torch.nn as nn
import torch.optim as optim

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]
word_set = set(word for sentence, _ in data for word in sentence)
tag_set = set(tag for _, tags in data for tag in tags)
word2idx = {word: i + 1 for i, word in enumerate(word_set)}  # +1 for padding
word2idx["<PAD>"] = 0
tag2idx = {tag: i for i, tag in enumerate(tag_set)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

EMBEDDING_DIM = 32
HIDDEN_DIM = 64
EPOCHS = 100

def encode_sentence(sentence, tag_seq, max_len):
    word_ids = [word2idx[word] for word in sentence]
    tag_ids = [tag2idx[tag] for tag in tag_seq]
    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)  # Ignore index for padding
    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode_sentence(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class LSTM_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim, hidden_dim):
        super(LSTM_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

model = LSTM_POS(len(word2idx), len(tag2idx), EMBEDDING_DIM, HIDDEN_DIM)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    output = model(X)  # (batch, seq_len, tagset_size)
    output = output.view(-1, output.shape[-1])
    y_flat = y.view(-1)
    loss = criterion(output, y_flat)
    loss.backward()
    optimizer.step()

def predict(sentence):
    model.eval()
    tokens = [word2idx.get(word, 0) for word in sentence]
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        predictions = model(input_tensor)
    pred_tags = torch.argmax(predictions, dim=2)[0]
    return [idx2tag[idx.item()] for idx in pred_tags[:len(sentence)]]

test_sentence = ["I", "play", "football"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))
OUTPUT:
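A common refinement of the LSTM tagger is a bidirectional LSTM, so that each position sees both left and right context. The sketch below is an assumed variant, not part of the original lab code; only the bidirectional flag and the doubled Linear input size differ from LSTM_POS.

# Hedged sketch: a bidirectional variant of LSTM_POS.
import torch.nn as nn

class BiLSTM_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)  # forward + backward hidden states

    def forward(self, x):
        out, _ = self.lstm(self.embedding(x))
        return self.fc(out)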
4. Transformer using an Encoder Architecture
# 2.4
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Sample data
data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]
word_set = set(w for sent, _ in data for w in sent)
tag_set = set(tag for _, tags in data for tag in tags)
word2idx = {w: i + 1 for i, w in enumerate(word_set)}
word2idx["<PAD>"] = 0
tag2idx = {t: i for i, t in enumerate(tag_set)}
idx2tag = {i: t for t, i in tag2idx.items()}

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model).float()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

def encode(sentence, tags, max_len):
    word_ids = [word2idx[w] for w in sentence]
    tag_ids = [tag2idx[t] for t in tags]
    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)
    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class TransformerPOSTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=64, num_heads=2, num_layers=2, ff_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.positional_encoding = PositionalEncoding(emb_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = self.positional_encoding(x)
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch_size, emb_dim)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)  # Back to (batch_size, seq_len, emb_dim)
        return self.fc(x)

model = TransformerPOSTagger(len(word2idx), len(tag2idx))
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    outputs = model(X)
    outputs = outputs.view(-1, outputs.shape[-1])
    loss = criterion(outputs, y.view(-1))
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Inference
def predict(sentence):
    model.eval()
    tokens = [word2idx.get(w, 0) for w in sentence]
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        out = model(input_tensor)
    pred = torch.argmax(out, dim=-1)[0]
    return [idx2tag[i.item()] for i in pred[:len(sentence)]]

# Test
test_sentence = ["I", "write", "code"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))
OUTPUT:
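The question also asks for text summarization; the toy encoder above is too small for that, so one common route is a pre-trained encoder-decoder model through the Hugging Face transformers pipeline. The sketch below is an assumption-laden illustration: it presumes the transformers package is installed, that the default summarization model can be downloaded, and it simply picks a short Gutenberg excerpt as input.

# Hedged sketch: summarizing a short passage with a pre-trained model.
from nltk.corpus import gutenberg
from transformers import pipeline

summarizer = pipeline("summarization")
passage = " ".join(gutenberg.words("austen-emma.txt")[:400])  # short excerpt
summary = summarizer(passage, max_length=60, min_length=20, do_sample=False)
print(summary[0]["summary_text"])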
3. Implement text classification (sensitive data vs. normal data) using a linear SVM algorithm, based on the sensitive-data program from week 7/8.
import nltk
import re
from nltk.corpus import brown
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tabulate import tabulate

nltk.download('brown')
nltk.download('punkt')

corpus_sentences = [" ".join(sentence) for sentence in brown.sents()[:2000]]

sensitive_keywords = {
    "personal": ["name", "email", "address", "dob", "birth", "phone", "gender"],
    "financial": ["credit", "debit", "card", "account", "balance", "bank", "income", "salary"],
    "social": ["facebook", "twitter", "instagram", "friends", "social", "media", "relationship"]
}
category_score = {
    "personal": 5,
    "financial": 4,
    "social": 3
}

def classify_sensitivity(text):
    text = text.lower()
    max_sensitivity = 0
    matched_words = []
    for category, keywords in sensitive_keywords.items():
        for keyword in keywords:
            if keyword in text:
                matched_words.append((keyword, category_score[category]))
                max_sensitivity = max(max_sensitivity, category_score[category])
    classification = "Sensitive" if max_sensitivity > 0 else "Normal"
    return classification, matched_words

processed_sentences = []
labels = []
matched_keywords = []
for sentence in corpus_sentences:
    label, found = classify_sensitivity(sentence)
    processed_sentences.append(sentence)
    labels.append(label)
    matched_keywords.append(found)

sample_data = []
for i in range(5):
    sample_data.append({
        "Sentence": processed_sentences[i],
        "Label": labels[i],
        "Sensitive Terms": ", ".join([f"{term}({score})" for term, score in matched_keywords[i]]) if matched_keywords[i] else "-"
    })
sample_df = pd.DataFrame(sample_data)
print("\n📌 Sample Sensitivity Analysis:\n")
print(tabulate(sample_df, headers='keys', tablefmt='grid', showindex=True))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_sentences)
y = [1 if label == "Sensitive" else 0 for label in labels]  # Convert to binary labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm_classifier = LinearSVC()
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

report = classification_report(y_test, y_pred, target_names=["Normal", "Sensitive"], output_dict=True)
report_df = pd.DataFrame(report).transpose()
print("\n📊 Model Evaluation Report:\n")
print(tabulate(report_df, headers='keys', tablefmt='grid', floatfmt=".2f"))
OUTPUT:
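As a quick usage illustration, the fitted TfidfVectorizer and LinearSVC from the listing above can classify new text. The sketch below is meant to run after that listing (it reuses vectorizer and svm_classifier), and the example sentences are made up for illustration rather than taken from the Brown corpus.

# Hedged usage sketch: classifying new sentences with the fitted pipeline above.
new_sentences = [
    "Please update my bank account and credit card details.",
    "The committee met on Friday to discuss the schedule.",
]
new_features = vectorizer.transform(new_sentences)  # reuse the fitted vocabulary
for sentence, pred in zip(new_sentences, svm_classifier.predict(new_features)):
    print(f"{'Sensitive' if pred == 1 else 'Normal'}: {sentence}")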