EXPERIMENT 8
NLP
Rajarajeshwari
March 30, 2025
1 Introduction
Word embeddings are essential for various NLP tasks, providing dense vector representations of words that capture semantic and syntactic relationships. This study evaluates different word embedding models for the Maithili language.
2.4 Transformers
Transformers use self-attention mechanisms to process entire sequences in parallel, leading
to state-of-the-art performance in contextual embeddings.
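The core operation is scaled dot-product attention: for query, key, and value matrices Q, K, and V with key dimension d_k,
\[ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right) V, \]
so every output position is a weighted combination of all value vectors, which is what lets the model contextualize each token against the entire sequence in parallel.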
2.6 Term Frequency-Inverse Document Frequency (TF-IDF)
TF-IDF represents words based on their importance within a document, calculated as
the product of term frequency and inverse document frequency.
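For a term t in a document d, with N documents in the corpus and df(t) the number of documents containing t, the standard weighting is
\[ \mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \log \frac{N}{\mathrm{df}(t)}, \]
where tf(t, d) is the (possibly normalized) frequency of t in d. Library implementations such as scikit-learn, used later in this report, apply smoothed variants of the idf term.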
2.11 FastText
FastText extends Skip-gram by representing words as character n-grams, improving embeddings for rare and out-of-vocabulary words.
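A minimal gensim sketch (with toy English tokens; the experiments below train on the tokenized Maithili corpus) illustrates how a word never seen during training still receives a vector assembled from its character n-grams:
from gensim.models import FastText
toy_sentences = [["language", "model"], ["language", "embedding"]]
toy_ft = FastText(sentences=toy_sentences, vector_size=50, window=3, min_count=1)
oov_vector = toy_ft.wv["languages"]  # "languages" was never seen, but shares n-grams with "language"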
2.15 Bidirectional Encoder Representations from Transformers
(BERT)
BERT uses masked language modeling and next sentence prediction to generate context-aware word embeddings.
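As an illustration of the masked-language-modelling objective, the Hugging Face fill-mask pipeline can be used to predict a masked token; the multilingual checkpoint named here is an assumption, not necessarily the one used in the experiments:
from transformers import pipeline
unmasker = pipeline("fill-mask", model="bert-base-multilingual-cased")  # assumed checkpoint
print(unmasker("Maithili is an Indo-Aryan [MASK]."))  # top candidate fillers with scores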
2.18 NV-Embed-v2
NV-Embed-v2 is a generalist text embedding model that adapts a decoder-only large language model with latent-attention pooling and contrastive instruction tuning to produce high-quality text embeddings.
2.19 Doc2Vec
Doc2Vec extends Word2Vec to generate embeddings for entire documents rather than
individual words.
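A minimal gensim sketch (toy documents; the experiments below use the tokenized Maithili articles) shows how whole documents are tagged, trained on, and embedded:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
toy_docs = ["word vectors for documents", "documents get their own vectors"]
tagged = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(toy_docs)]
toy_d2v = Doc2Vec(tagged, vector_size=50, min_count=1, epochs=40)
doc_vector = toy_d2v.infer_vector("an unseen document".split())  # embedding for a whole document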
2.20 InferSent
InferSent is a sentence embedding model trained on natural language inference data to capture semantic relationships between sentence pairs.
3 Experimental Setup
We evaluate the above models on Maithili datasets for short and long text embedding.
We apply both intrinsic and extrinsic evaluation metrics.
4 Intrinsic Evaluation
We assess word similarity, word analogy, concept categorization, outlier detection, QVEC, embedding latency, retrieval quality, geodesic correlation, triplet loss, Minimum Reconstruction Error (MRE) score, and t-SNE and PCA visualizations.
5 Extrinsic Evaluation
Extrinsic evaluation includes POS tagging, chunking, named-entity recognition (NER),
sentiment analysis, paraphrase identification, and neural machine translation (NMT).
6 Comparative Analysis
We compare the performance of the different embeddings across the intrinsic and extrinsic evaluations described above.
7 Conclusion
Our study provides insights into the effectiveness of different word embedding models for
Maithili NLP, guiding future research directions.
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Collect links to individual articles from the landing page
# (we assume that article links are contained in <a> tags with specific classes)
def get_article_urls(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    article_links = []
    for article in soup.find_all('a', href=True):  # <a> tags with href attributes
        link = article['href']
        # Filter links for those that lead to articles (refine the condition based on the actual page structure)
        if link.startswith('https://www.mithilanews.com/') and 'category' not in link:
            article_links.append(link)
    return article_links

# Helper (name reconstructed) to extract the article text
# (assuming article text is in a <div> with class 'entry-content' or similar)
def get_article_text(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    article_text = ""
    content_div = soup.find('div', class_='entry-content')  # Adjust based on the actual site structure
    if content_div:
        article_text = content_div.get_text(separator=' ', strip=True)  # Get text from article
    return article_text

# Function to scrape multiple articles and save them into a CSV file
def scrape_mithila_news(base_url, output_file):
    article_urls = get_article_urls(base_url)
    articles = []
    for url in article_urls:  # (loop reconstructed) fetch each article's text
        articles.append({'url': url, 'text': get_article_text(url)})
    # Save the scraped articles into a pandas DataFrame and then to CSV
    df = pd.DataFrame(articles)
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Scraping complete! Data saved to {output_file}")
Running the scraper may emit BeautifulSoup's XMLParsedAsHTMLWarning (the pages are served as XML but parsed with an HTML parser); since the extracted text is unaffected, the warning can be filtered:
import warnings
from bs4 import XMLParsedAsHTMLWarning
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
data=pd.read_csv("maithili_articles.csv")
data['text'].head()
import nltk
nltk.download('punkt_tab')
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, FastText
# Sample text column (Update this based on your dataset's text column name)
text_column = "text" # Change to the actual column name if different
data = data.dropna(subset=[text_column]) # Drop missing values
# Text Preprocessing
def preprocess_text(text):
    text = re.sub(r'[^ऀ-ॿ ]', '', text)  # Keep only Maithili (Devanagari) characters and spaces
    tokens = word_tokenize(text)
    return tokens if tokens else ["empty"]  # Prevent empty lists
nltk.download('punkt')
data['tokens'] = data[text_column].apply(preprocess_text)
# 1. TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(analyzer='word', tokenizer=lambda x: x, preprocessor=lambda x: x)
X_tfidf = tfidf_vectorizer.fit_transform(data['tokens'])
# 2. Train Word2Vec (CBOW) (cell not shown in the extract; settings assumed to mirror the FastText call below)
word2vec_cbow = Word2Vec(sentences=data['tokens'], vector_size=100, window=5, min_count=2, workers=4, sg=0)  # CBOW Word2Vec
# 3. Train FastText
fasttext_model = FastText(sentences=data['tokens'], vector_size=100, window=5, min_count=2, workers=4, sg=1)  # Skip-gram FastText
# Example Usage:
print("TF-IDF Shape:", X_tfidf.shape)
print("Word2Vec CBOW Example:", word2vec_cbow.wv.most_similar("मैथिली", topn=5)) # Replace with a valid Maithili word
print("FastText Example:", fasttext_model.wv.most_similar("मैथिली", topn=5)) # Replace with a valid Maithili word
# Intrinsic Evaluation
## Word Similarity
def word_similarity(model, word):
    try:
        return model.wv.most_similar(word, topn=5)
    except KeyError:
        return "Word not in vocabulary"
Word Similarity (CBOW): [('आऽ', 0.9916399717330933), ('हिनक', 0.9912693500518799), ('ई', 0.9899354577064514), ('आ', 0.98990559577
Word Similarity (FastText): [('२मैथिली', 0.9999423623085022), ('१मैथिली', 0.9999391436576843), ('१३मैथिली', 0.9998805522918701), ('मैथिलीक
## Word Analogy
def word_analogy(model, positive_words, negative_words):
    try:
        return model.wv.most_similar(positive=positive_words, negative=negative_words, topn=5)
    except KeyError:
        return "Words not in vocabulary"
## t-SNE Visualization
def plot_embeddings(model, title):
    words = list(model.wv.index_to_key)[:100]  # 100 most frequent words
    vectors = np.array([model.wv[word] for word in words])
    tsne = TSNE(n_components=2, random_state=42)
    reduced_vectors = tsne.fit_transform(vectors)
    plt.figure(figsize=(10, 6))
    plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1])
    for word, (x, y) in zip(words, reduced_vectors):
        plt.text(x, y, word, fontsize=8)
    plt.title(title)
    plt.show()
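The function can then be applied to each trained model (titles are illustrative; the calls assume numpy, matplotlib, and scikit-learn's TSNE, imported in the block further below, are available):
plot_embeddings(word2vec_cbow, "t-SNE of Word2Vec (CBOW) embeddings")
plot_embeddings(fasttext_model, "t-SNE of FastText embeddings")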
import nltk
nltk.download('averaged_perceptron_tagger_eng')
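A minimal sketch of how the tagged output below may have been produced, applying NLTK's English perceptron tagger directly to Maithili tokens (the choice of sample row is an assumption):
from nltk import pos_tag
sample_tokens = word_tokenize(data[text_column].iloc[0])  # tokens from the first article (assumed sample)
print("POS Tagging Example:", pos_tag(sample_tokens))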
POS Tagging Example: [('मैथिली', 'JJ'), ('भाषा', 'NNP'), ('भारत', 'NNP'), ('में', 'NNP'), ('प्राचीन', 'NNP'), ('भाषाओं', 'NNP'), ('में', 'NNP
# Comparative Investigation
## Similarity Comparisons
word_a = "भाषा"
word_b = "मैथिली"
word_c = "प्रभात"
similarity_ab = word2vec_cbow.wv.similarity(word_a, word_b)
similarity_ac = word2vec_cbow.wv.similarity(word_a, word_c)
print(f"Similarity {word_a}-{word_b}: {similarity_ab}, {word_a}-{word_c}: {similarity_ac}")
print("Is A more similar to B than C?", similarity_ab > similarity_ac)
import pandas as pd
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import LatentDirichletAllocation
from transformers import BertModel, BertTokenizer
import torch
# Load dataset
data = pd.read_csv("maithili_articles.csv")
# Sample text column (Update this based on your dataset's text column name)
text_column = "text" # Change to the actual column name if different
data = data.dropna(subset=[text_column]) # Drop missing values
# Text Preprocessing
def preprocess_text(text):
    text = re.sub(r'[^ऀ-ॿ ]', '', text)  # Keep only Maithili (Devanagari) characters and spaces
    sentences = sent_tokenize(text)  # Sentence tokenization
    tokenized_sentences = [word_tokenize(sent) for sent in sentences]
    return tokenized_sentences
nltk.download('punkt')
data['tokenized_sentences'] = data[text_column].apply(preprocess_text)
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import warnings
# Ignore warnings
warnings.filterwarnings('ignore')
# Load a multilingual BERT checkpoint (the checkpoint name is assumed; substitute the one actually used)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertModel.from_pretrained("bert-base-multilingual-cased")
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # mean-pool the token embeddings
data['bert_embeddings'] = data[text_column].apply(get_bert_embedding)
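The example prints below also reference LSA, LDA, and Doc2Vec models whose training cells are not shown above; a minimal sketch of those steps, with illustrative rather than the report's exact parameters, is:
flat_tokens = data['tokenized_sentences'].apply(lambda sents: [w for s in sents for w in s])
count_vectorizer = CountVectorizer(analyzer='word', tokenizer=lambda x: x, preprocessor=lambda x: x)
X_counts = count_vectorizer.fit_transform(flat_tokens)
X_lsa = TruncatedSVD(n_components=100, random_state=42).fit_transform(X_tfidf)  # LSA over the TF-IDF matrix
X_lda = LatentDirichletAllocation(n_components=10, random_state=42).fit_transform(X_counts)  # per-document topic mixtures
tagged_docs = [TaggedDocument(words=toks, tags=[i]) for i, toks in enumerate(flat_tokens)]
doc2vec_model = Doc2Vec(tagged_docs, vector_size=100, window=5, min_count=2, workers=4)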
# Example Usage
print("TF-IDF Shape:", X_tfidf.shape)
print("LSA Shape:", X_lsa.shape)
print("LDA Shape:", X_lda.shape)
print("Word2Vec CBOW Example:", word2vec_cbow.wv.most_similar("मैथिली", topn=5)) # Replace with a valid Maithili word
print("FastText Example:", fasttext_model.wv.most_similar("मैथिली", topn=5)) # Replace with a valid Maithili word
print("Doc2Vec Example:", doc2vec_model.dv.most_similar([doc2vec_model.dv[0]], topn=5)) # Corrected index handling
print("BERT Embedding Shape:", data['bert_embeddings'][0].shape)