1.) Write a program to tokenize the given text into sentences and words: "Hello everyone. Welcome to NITTE (Deemed to be University) NMAMIT. I AM studying the NLP Elective." Use at least three different methods to perform the same.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize

nltk.download('punkt_tab')

# Given text to tokenize
text = ("Hello everyone. Welcome to NITTE (Deemed to be University) "
        "NMAMIT. I AM studying the NLP Elective.")

# Method 1: Splitting the paragraph into sentences using sent_tokenize
print("\nMethod 1: Splitting sentences in the paragraph")
print(text)
print(sent_tokenize(text))

# Method 2: Splitting the text into words using word_tokenize
print("\nMethod 2: Splitting words in the sentence")
print(word_tokenize(text))

# Method 3: Tokenizing words using a regular expression with regexp_tokenize
print("\nMethod 3: Tokenizing words using regular expression")
print(regexp_tokenize(text, r"[\w]+"))
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt_tab.zip.
Method 1: Splitting sentences in the paragraph
Hello everyone. Welcome to NITTE (Deemed to be University) NMAMIT. I
AM studying the NLP Elective.
['Hello everyone.', 'Welcome to NITTE (Deemed to be University)
NMAMIT.', 'I AM studying the NLP Elective.']
Method 2: Splitting words in the sentence
['Hello', 'everyone', '.', 'Welcome', 'to', 'NITTE', '(', 'Deemed',
'to', 'be', 'University', ')', 'NMAMIT', '.', 'I', 'AM', 'studying',
'the', 'NLP', 'Elective', '.']
Method 3: Tokenizing words using regular expression
['Hello', 'everyone', 'Welcome', 'to', 'NITTE', 'Deemed', 'to', 'be',
'University', 'NMAMIT', 'I', 'AM', 'studying', 'the', 'NLP',
'Elective']
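A fourth word-level option, beyond the three methods asked for, is NLTK's TreebankWordTokenizer class. The short sketch below is an optional addition (not part of the original program) and assumes the text variable defined above.

from nltk.tokenize import TreebankWordTokenizer

# Optional Method 4 (sketch): class-based, rule-driven word tokenization
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))  # token list similar to the word_tokenize output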
2.) How does the recursive generate function use a PCFG
defined in a Python dictionary to select weighted
production rules and expand the starting symbol 'S' into
a complete sentence?
import random

# Define a simple PCFG grammar.
# Each key is a non-terminal symbol with a list of tuples.
# Each tuple contains a production rule (as a list of symbols) and its probability.
grammar = {
    "S": [(["NP", "VP"], 1.0)],   # Sentence -> Noun Phrase + Verb Phrase
    "NP": [
        (["Det", "N"], 0.8),      # Noun Phrase -> Determiner + Noun
        (["Name"], 0.2)           # Noun Phrase -> Proper Name
    ],
    "VP": [
        (["V", "NP"], 0.5),       # Verb Phrase -> Verb + Noun Phrase
        (["V"], 0.5)              # Verb Phrase -> Verb
    ],
    "Det": [
        (["the"], 0.5),
        (["a"], 0.5)
    ],
    "N": [
        (["cat"], 0.5),
        (["dog"], 0.5)
    ],
    "Name": [
        (["Alice"], 1.0)
    ],
    "V": [
        (["sees"], 1.0)
    ]
}
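# (Added sketch, not part of the original program.) Sanity-check that the rule
# probabilities attached to each non-terminal sum to 1, so that random.choices
# receives well-formed weights.
for lhs, productions in grammar.items():
    total = sum(prob for _, prob in productions)
    assert abs(total - 1.0) < 1e-9, f"Probabilities for '{lhs}' sum to {total}"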
def generate(symbol):
    """
    Recursively generates a sentence fragment from the given symbol
    using the PCFG grammar.

    Parameters:
        symbol (str): The non-terminal or terminal symbol to expand.

    Returns:
        str: The generated string from the grammar.
    """
    # If the symbol is not in the grammar, it's assumed to be a terminal.
    if symbol not in grammar:
        return symbol
    productions = grammar[symbol]
    # Unzip the production rules and their corresponding weights.
    rules, weights = zip(*productions)
    # Choose one production rule based on the probabilities.
    chosen_rule = random.choices(rules, weights=weights, k=1)[0]
    # Debug log: show the chosen production rule for the current non-terminal.
    print(f"Expanding '{symbol}' using rule: {chosen_rule}")
    # Recursively generate the string for each symbol in the chosen rule.
    result = [generate(sym) for sym in chosen_rule]
    return " ".join(result)

# Generate a sentence starting from the initial symbol 'S'
sentence = generate("S")
print("\nGenerated Sentence:", sentence)
Expanding 'S' using rule: ['NP', 'VP']
Expanding 'NP' using rule: ['Name']
Expanding 'Name' using rule: ['Alice']
Expanding 'VP' using rule: ['V', 'NP']
Expanding 'V' using rule: ['sees']
Expanding 'NP' using rule: ['Det', 'N']
Expanding 'Det' using rule: ['a']
Expanding 'N' using rule: ['dog']
Generated Sentence: Alice sees a dog
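In short, generate looks the current symbol up in the grammar dictionary, uses random.choices with the rule probabilities as weights to pick one production, and recursively expands every symbol in that production until only terminals remain, joining them with spaces. A minimal usage sketch (an addition, reusing the grammar and generate defined above) that seeds the random generator and draws several sentences:

import random

random.seed(42)  # assumed seed, only to make repeated runs reproducible
for i in range(3):
    print(f"Sample {i + 1}: {generate('S')}")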
3.) Build a trigram model using the Reuters corpus to predict the next word based on the two preceding words.
# Import necessary libraries
import nltk
from nltk import bigrams, trigrams
from nltk.corpus import reuters
from collections import defaultdict

# Download necessary NLTK resources
nltk.download('reuters')
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the text
words = nltk.word_tokenize(' '.join(reuters.words()))

# Create trigrams
tri_grams = list(trigrams(words))

# Build a trigram model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurrence
for w1, w2, w3 in tri_grams:
    model[(w1, w2)][w3] += 1

# Transform the counts into probabilities
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count
# Function to predict the next word
def predict_next_word(w1, w2):
    """
    Predicts the next word based on the previous two words using the
    trained trigram model.

    Args:
        w1 (str): The first word.
        w2 (str): The second word.

    Returns:
        str: The predicted next word.
    """
    next_word = model[w1, w2]
    if next_word:
        # Choose the most likely next word
        predicted_word = max(next_word, key=next_word.get)
        return predicted_word
    else:
        return "No prediction available"

# Example usage
print("Next Word:", predict_next_word('the', 'stock'))
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data] Package punkt_tab is already up-to-date!
Next Word: of
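As a small follow-up (an addition, assuming the model dictionary built above), the same counts can rank several candidate continuations instead of returning only the single most likely word:

def top_next_words(w1, w2, k=3):
    """Return up to k of the most probable next words after the bigram (w1, w2)."""
    candidates = model[w1, w2]
    # Sort candidate words by their conditional probability, highest first
    return sorted(candidates.items(), key=lambda item: item[1], reverse=True)[:k]

print(top_next_words('the', 'stock'))  # list of (word, probability) pairs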
4.) Using Python's re module, extract US phone numbers, USNs (format LLLNNLLDDD), and email addresses from the given string:
"Reach us at 800-555-1212 or help@company.com. Student ID: NNM21EC099."
import re  # Import the regular expression module

# 1. The text we want to search within
text_to_search = ("Reach us at 800-555-1212 or help@company.com. "
                  "Student ID: NNM21EC099.")

# 2. Define the regular expression patterns
# Phone number pattern
phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
# USN pattern (LLLNNLLDDD)
usn_pattern = r"[A-Z]{3}\d{2}[A-Z]{2}\d{3}"
# Email address pattern
email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

# 3. Find all matches for each pattern
found_phones = re.findall(phone_pattern, text_to_search)
found_usns = re.findall(usn_pattern, text_to_search)
found_emails = re.findall(email_pattern, text_to_search)

# 4. Print the results
print("--- Original Text ---")
print(text_to_search)
print("-" * 20)  # Separator

print(f"\n--- Found Phone Numbers (Pattern: {phone_pattern}) ---")
if found_phones:
    for phone in found_phones:
        print(f"- {phone}")
else:
    print("No phone numbers found matching the pattern.")

print(f"\n--- Found USN Numbers (Pattern: {usn_pattern}) ---")
if found_usns:
    for usn in found_usns:
        print(f"- {usn}")
else:
    print("No USN numbers found matching the pattern.")

print(f"\n--- Found Email Addresses (Pattern: {email_pattern}) ---")
if found_emails:
    for email in found_emails:
        print(f"- {email}")
else:
    print("No email addresses found matching the pattern.")
--- Original Text ---
Reach us at 800-555-1212 or help@company.com. Student ID: NNM21EC099.
--------------------
--- Found Phone Numbers (Pattern: \(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})
---
- 800-555-1212
--- Found USN Numbers (Pattern: [A-Z]{3}\d{2}[A-Z]{2}\d{3}) ---
- NNM21EC099
--- Found Email Addresses (Pattern: [a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.
[a-zA-Z]{2,}) ---
- help@company.com
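An optional extension (not part of the original program): re.finditer reports where each match occurs, which is useful when the positions matter. The sketch below reuses the patterns and text_to_search defined above:

# Show the (start, end) span of every match for each pattern
patterns = {"phone": phone_pattern, "usn": usn_pattern, "email": email_pattern}
for label, pattern in patterns.items():
    for match in re.finditer(pattern, text_to_search):
        print(f"{label}: {match.group()} at span {match.span()}")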
5.) What do the cost parameters (ins_cost, del_cost,
sub_cost) control in this edit distance function?
def weighted_edit_distance_no_numpy(s1, s2, ins_cost=1, del_cost=1, sub_cost=1):
    m = len(s1)
    n = len(s2)
    # Initialize DP table with nested lists
    dp = [[0.0 for _ in range(n + 1)] for _ in range(m + 1)]
    # --- Initialization ---
    for j in range(n + 1):
        dp[0][j] = j * ins_cost
    for i in range(m + 1):
        dp[i][0] = i * del_cost
    # --- Fill DP table ---
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            current_sub_cost = 0 if s1[i - 1] == s2[j - 1] else sub_cost
            deletion = dp[i - 1][j] + del_cost
            insertion = dp[i][j - 1] + ins_cost
            substitution = dp[i - 1][j - 1] + current_sub_cost
            dp[i][j] = min(deletion, insertion, substitution)
    return dp[m][n]

# Example usage
string1 = "intention"
string2 = "execution"
distance1 = weighted_edit_distance_no_numpy(string1, string2, ins_cost=1, del_cost=1, sub_cost=1)
print(f"Weighted edit distance between '{string1}' and '{string2}': {distance1}")
Weighted edit distance between 'intention' and 'execution': 5
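To answer the question: ins_cost, del_cost, and sub_cost are the penalties charged for inserting a character, deleting a character, and substituting one character for another (matching characters cost nothing), and the DP table keeps the cheapest combination of these operations, so changing the weights changes the distance. A short sketch reusing the function above; with substitutions charged 2 (the common Levenshtein variant), the 'intention'/'execution' distance rises from 5 to 8 because the algorithm falls back on insert+delete pairs:

# Same strings, but substitutions now cost 2 instead of 1
distance2 = weighted_edit_distance_no_numpy("intention", "execution",
                                            ins_cost=1, del_cost=1, sub_cost=2)
print(f"Weighted edit distance with sub_cost=2: {distance2}")  # expected: 8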