
Natural Language Processing

Lab Assignment

R.BhanuKiran
22BCE9560
1. Implement text pre-processing on the Brown corpus and the Gutenberg corpus and display the list of tokens (count), list of sentences (count), count of paragraphs, list & count of unique words, list & count of rare words, and list & count of stop-words separately for each corpus.

import nltk
from nltk.corpus import brown, gutenberg, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import string

nltk.download('punkt_tab')
nltk.download('brown')
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

def preprocess_and_analyze_corpus(corpus, corpus_name="Corpus"):
    print(f"\n=== 📘 Analyzing {corpus_name} ===")
    if corpus_name == "Brown":
        raw_text = " ".join(brown.words())
        fileids = brown.fileids()
    elif corpus_name == "Gutenberg":
        raw_text = " ".join(gutenberg.words())
        fileids = gutenberg.fileids()
    else:
        return

    tokens = word_tokenize(raw_text)
    sents = sent_tokenize(raw_text)

    # Keep lowercase alphabetic tokens that are neither stop-words nor punctuation
    cleaned_tokens = [
        token.lower() for token in tokens
        if token.lower() not in stop_words and token not in punctuations and token.isalpha()
    ]

    token_counts = Counter(cleaned_tokens)
    unique_words = list(token_counts.keys())
    rare_words = [word for word, count in token_counts.items() if count == 1]
    stopword_list = [token for token in tokens if token.lower() in stop_words]

    print(f"Total Tokens: {len(tokens)}")
    print(f"Sample Tokens: {tokens[:10]}")
    print(f"Total Sentences: {len(sents)}")
    print(f"Sample Sentences: {sents[:2]}")
    print(f"Total Paragraphs (FileIDs): {len(fileids)}")
    print(f"FileIDs (Used as Paragraphs): {fileids[:5]}")
    print(f"Unique Words Count: {len(unique_words)}")
    print(f"Unique Words Sample: {unique_words[:10]}")
    print(f"Rare Words Count: {len(rare_words)}")
    print(f"Rare Words Sample: {rare_words[:10]}")
    print(f"Stopwords Count: {len(stopword_list)}")
    print(f"Stopwords Sample: {stopword_list[:10]}")
    print("\n" + "-"*60)

preprocess_and_analyze_corpus(brown, corpus_name="Brown")
preprocess_and_analyze_corpus(gutenberg, corpus_name="Gutenberg")
OUTPUT:
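Note: the function above reports the number of file IDs as a proxy for paragraph count. If true paragraph counts are wanted, the NLTK corpus readers also expose a paras() method; a minimal sketch, assuming the same downloads as above:

# Optional check: count actual paragraphs rather than using file IDs as a proxy.
# Assumes the brown and gutenberg corpora have already been downloaded via nltk.download().
from nltk.corpus import brown, gutenberg

print("Brown paragraph count:", len(brown.paras()))
print("Gutenberg paragraph count:", len(gutenberg.paras()))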

2. Perform text summarization, POS tagging and NER modelling on the post-processed corpus using the following neural networks.

1. Feed-forward Neural Network


import torch
import torch.nn as nn
import torch.optim as optim

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]

words = sorted(set(word for sent, _ in data for word in sent))
tags = sorted(set(tag for _, t in data for tag in t))

word2idx = {word: i for i, word in enumerate(words)}
tag2idx = {tag: i for i, tag in enumerate(tags)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

X = []
y = []

for sent, tag_seq in data:
    for word, tag in zip(sent, tag_seq):
        X.append(word2idx[word])
        y.append(tag2idx[tag])

X = torch.tensor(X)
y = torch.tensor(y)

class FFNN_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=32, hidden_dim=64):
        super(FFNN_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.fc1 = nn.Linear(emb_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

model = FFNN_POS(len(word2idx), len(tag2idx))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training: each word is classified independently of its context
for epoch in range(100):
    optimizer.zero_grad()
    outputs = model(X)
    loss = criterion(outputs, y)
    loss.backward()
    optimizer.step()

# Testing
model.eval()
with torch.no_grad():
    test_word = "love"
    test_input = torch.tensor([word2idx[test_word]])  # shape: [1]
    pred = model(test_input)
    predicted_tag = idx2tag[torch.argmax(pred).item()]
    print(f"Prediction for '{test_word}': {predicted_tag}")

OUTPUT:

2. Recurrent Neural Network (RNN)


import torch
import torch.nn as nn
import torch.optim as optim

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]

word_set = set(word for sentence, _ in data for word in sentence)
tag_set = set(tag for _, tags in data for tag in tags)

word2idx = {word: i + 1 for i, word in enumerate(word_set)}  # index 0 is reserved for padding
word2idx["<PAD>"] = 0
tag2idx = {tag: i for i, tag in enumerate(tag_set)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

EMBEDDING_DIM = 32
HIDDEN_DIM = 64
EPOCHS = 100

def encode_sentence(sentence, tag_seq, max_len):
    word_ids = [word2idx[word] for word in sentence]
    tag_ids = [tag2idx[tag] for tag in tag_seq]

    # Padding: tag -1 is ignored by the loss
    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)

    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode_sentence(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class RNN_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim, hidden_dim):
        super(RNN_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.rnn(x)
        output = self.fc(output)
        return output

model = RNN_POS(len(word2idx), len(tag2idx), EMBEDDING_DIM, HIDDEN_DIM)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    output = model(X)  # (batch, seq_len, tagset_size)
    output = output.view(-1, output.shape[-1])
    y_flat = y.view(-1)
    loss = criterion(output, y_flat)
    loss.backward()
    optimizer.step()

def predict(sentence):
    model.eval()
    tokens = [word2idx.get(word, 0) for word in sentence]  # unknown words map to <PAD>
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        predictions = model(input_tensor)
        pred_tags = torch.argmax(predictions, dim=2)[0]
    return [idx2tag[idx.item()] for idx in pred_tags[:len(sentence)]]

test_sentence = ["She", "plays", "football"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))

OUTPUT:

3. Long Short-Term Memory (LSTM)
import torch
import torch.nn as nn
import torch.optim as optim

data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]

word_set = set(word for sentence, _ in data for word in sentence)
tag_set = set(tag for _, tags in data for tag in tags)

word2idx = {word: i + 1 for i, word in enumerate(word_set)}  # +1 reserves index 0 for padding
word2idx["<PAD>"] = 0
tag2idx = {tag: i for i, tag in enumerate(tag_set)}
idx2tag = {i: tag for tag, i in tag2idx.items()}

EMBEDDING_DIM = 32
HIDDEN_DIM = 64
EPOCHS = 100

def encode_sentence(sentence, tag_seq, max_len):
    word_ids = [word2idx[word] for word in sentence]
    tag_ids = [tag2idx[tag] for tag in tag_seq]

    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)  # Ignore index for padding

    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode_sentence(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class LSTM_POS(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim, hidden_dim):
        super(LSTM_POS, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out)
        return out

model = LSTM_POS(len(word2idx), len(tag2idx), EMBEDDING_DIM, HIDDEN_DIM)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(EPOCHS):
    model.train()
    optimizer.zero_grad()
    output = model(X)  # (batch, seq_len, tagset_size)
    output = output.view(-1, output.shape[-1])
    y_flat = y.view(-1)
    loss = criterion(output, y_flat)
    loss.backward()
    optimizer.step()

def predict(sentence):
    model.eval()
    tokens = [word2idx.get(word, 0) for word in sentence]  # unknown words map to <PAD>
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        predictions = model(input_tensor)
        pred_tags = torch.argmax(predictions, dim=2)[0]
    return [idx2tag[idx.item()] for idx in pred_tags[:len(sentence)]]

test_sentence = ["I", "play", "football"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))

OUTPUT:

4. Transformer using an encoder architecture

import torch
import torch.nn as nn
import torch.optim as optim
import math

# Sample data
data = [
    (["I", "love", "coding"], ["PRON", "VERB", "NOUN"]),
    (["She", "writes", "code"], ["PRON", "VERB", "NOUN"]),
    (["They", "play", "football"], ["PRON", "VERB", "NOUN"]),
]

word_set = set(w for sent, _ in data for w in sent)
tag_set = set(tag for _, tags in data for tag in tags)

word2idx = {w: i + 1 for i, w in enumerate(word_set)}
word2idx["<PAD>"] = 0
tag2idx = {t: i for i, t in enumerate(tag_set)}
idx2tag = {i: t for t, i in tag2idx.items()}

class PositionalEncoding(nn.Module):
    # Sinusoidal positional encoding:
    # PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model).float()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

def encode(sentence, tags, max_len):
    word_ids = [word2idx[w] for w in sentence]
    tag_ids = [tag2idx[t] for t in tags]
    while len(word_ids) < max_len:
        word_ids.append(word2idx["<PAD>"])
        tag_ids.append(-1)
    return word_ids, tag_ids

max_len = max(len(s) for s, _ in data)
X, y = zip(*[encode(s, t, max_len) for s, t in data])
X = torch.tensor(X)
y = torch.tensor(y)

class TransformerPOSTagger(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=64, num_heads=2, num_layers=2, ff_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.positional_encoding = PositionalEncoding(emb_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_dim, nhead=num_heads, dim_feedforward=ff_dim)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_dim, tagset_size)

    def forward(self, x):
        x = self.embedding(x)  # (batch_size, seq_len, emb_dim)
        x = self.positional_encoding(x)
        x = x.permute(1, 0, 2)  # Transformer expects (seq_len, batch_size, emb_dim)
        x = self.transformer_encoder(x)
        x = x.permute(1, 0, 2)  # Back to (batch_size, seq_len, emb_dim)
        return self.fc(x)

model = TransformerPOSTagger(len(word2idx), len(tag2idx))
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    outputs = model(X)
    outputs = outputs.view(-1, outputs.shape[-1])
    loss = criterion(outputs, y.view(-1))
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Inference
def predict(sentence):
    model.eval()
    tokens = [word2idx.get(w, 0) for w in sentence]
    while len(tokens) < max_len:
        tokens.append(0)
    input_tensor = torch.tensor([tokens])
    with torch.no_grad():
        out = model(input_tensor)
        pred = torch.argmax(out, dim=-1)[0]
    return [idx2tag[i.item()] for i in pred[:len(sentence)]]

# Test
test_sentence = ["I", "write", "code"]
print("Sentence:", test_sentence)
print("Predicted POS:", predict(test_sentence))

OUTPUT:
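The four models above cover the POS-tagging part of the task. The NER part can reuse the same token-classification setup with an entity tag set instead of POS tags, and the summarization part can be handled with a pretrained encoder-decoder model. The two sketches below are illustrative only: the toy sentences, BIO labels and hyper-parameters are assumptions, not taken from the lab sheet.

# Hedged sketch: NER as token classification, mirroring the LSTM POS tagger above.
# The sentences and BIO tags below are made-up examples for illustration only.
import torch
import torch.nn as nn
import torch.optim as optim

ner_data = [
    (["John", "lives", "in", "Paris"], ["B-PER", "O", "O", "B-LOC"]),
    (["Mary", "works", "at", "Google"], ["B-PER", "O", "O", "B-ORG"]),
]

word2idx = {"<PAD>": 0}
for sent, _ in ner_data:
    for w in sent:
        word2idx.setdefault(w, len(word2idx))
tag2idx = {t: i for i, t in enumerate(sorted({t for _, tags in ner_data for t in tags}))}
idx2tag = {i: t for t, i in tag2idx.items()}

max_len = max(len(s) for s, _ in ner_data)
X = torch.tensor([[word2idx[w] for w in s] + [0] * (max_len - len(s)) for s, _ in ner_data])
y = torch.tensor([[tag2idx[t] for t in tags] + [-1] * (max_len - len(tags)) for _, tags in ner_data])

class LSTM_NER(nn.Module):
    def __init__(self, vocab_size, tagset_size, emb_dim=32, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, tagset_size)

    def forward(self, x):
        out, _ = self.lstm(self.embedding(x))
        return self.fc(out)

model = LSTM_NER(len(word2idx), len(tag2idx))
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # -1 marks padded positions
optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(100):
    optimizer.zero_grad()
    out = model(X).view(-1, len(tag2idx))
    loss = criterion(out, y.view(-1))
    loss.backward()
    optimizer.step()

test_sentence = ["John", "works", "in", "Google"]
with torch.no_grad():
    ids = torch.tensor([[word2idx.get(w, 0) for w in test_sentence]])
    preds = torch.argmax(model(ids), dim=-1)[0]
print(list(zip(test_sentence, [idx2tag[i.item()] for i in preds])))

For the summarization part, a minimal sketch using the Hugging Face transformers summarization pipeline, assuming the transformers package is installed and the default model can be downloaded:

# Hedged sketch: abstractive summarization with a pretrained encoder-decoder model.
# Assumes `pip install transformers` and network access to fetch the default model.
from transformers import pipeline

summarizer = pipeline("summarization")
text = (
    "The Brown corpus was compiled in the 1960s at Brown University and contains "
    "about one million words of American English drawn from a wide range of sources. "
    "It has been used extensively for part-of-speech tagging and other NLP research."
)
print(summarizer(text, max_length=40, min_length=10, do_sample=False)[0]["summary_text"])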

3. Implement text classification (sensitive data or normal data) using a linear SVM algorithm, based on the sensitive-data program from week 7/8.

import nltk
from nltk.corpus import brown
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
from tabulate import tabulate

nltk.download('brown')
nltk.download('punkt')

corpus_sentences = [" ".join(sentence) for sentence in brown.sents()[:2000]]

sensitive_keywords = {
    "personal": ["name", "email", "address", "dob", "birth", "phone", "gender"],
    "financial": ["credit", "debit", "card", "account", "balance", "bank", "income", "salary"],
    "social": ["facebook", "twitter", "instagram", "friends", "social", "media", "relationship"]
}

category_score = {
    "personal": 5,
    "financial": 4,
    "social": 3
}

def classify_sensitivity(text):
    text = text.lower()
    max_sensitivity = 0
    matched_words = []

    for category, keywords in sensitive_keywords.items():
        for keyword in keywords:
            if keyword in text:
                matched_words.append((keyword, category_score[category]))
                max_sensitivity = max(max_sensitivity, category_score[category])

    classification = "Sensitive" if max_sensitivity > 0 else "Normal"
    return classification, matched_words

processed_sentences = []
labels = []
matched_keywords = []

for sentence in corpus_sentences:
    label, found = classify_sensitivity(sentence)
    processed_sentences.append(sentence)
    labels.append(label)
    matched_keywords.append(found)

sample_data = []
for i in range(5):
    sample_data.append({
        "Sentence": processed_sentences[i],
        "Label": labels[i],
        "Sensitive Terms": ", ".join([f"{term}({score})" for term, score in matched_keywords[i]]) if matched_keywords[i] else "-"
    })

sample_df = pd.DataFrame(sample_data)
print("\n📌 Sample Sensitivity Analysis:\n")
print(tabulate(sample_df, headers='keys', tablefmt='grid', showindex=True))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_sentences)
y = [1 if label == "Sensitive" else 0 for label in labels]  # Convert to binary labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm_classifier = LinearSVC()
svm_classifier.fit(X_train, y_train)

y_pred = svm_classifier.predict(X_test)
report = classification_report(y_test, y_pred, target_names=["Normal", "Sensitive"], output_dict=True)
report_df = pd.DataFrame(report).transpose()

print("\n📊 Model Evaluation Report:\n")
print(tabulate(report_df, headers='keys', tablefmt='grid', floatfmt=".2f"))

OUTPUT:
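As a quick follow-up (not part of the original lab sheet), the trained vectorizer and classifier can also be applied to unseen sentences; a minimal sketch, assuming the objects defined above are still in scope:

# Hypothetical usage: classify new sentences with the trained TF-IDF + LinearSVC pipeline.
# Assumes `vectorizer` and `svm_classifier` from the code above are in scope.
new_sentences = [
    "Please update your bank account and credit card details.",
    "The committee met on Tuesday to discuss the schedule.",
]
new_X = vectorizer.transform(new_sentences)
for sentence, pred in zip(new_sentences, svm_classifier.predict(new_X)):
    print(sentence, "->", "Sensitive" if pred == 1 else "Normal")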
