Method 2: Convert to Data Frame
Convert the tibble to a regular data frame, which does not truncate rows when printed.
R
# Convert to a data frame
data_frame <- as.data.frame(data)
# View the data frame
View(data_frame) # Opens in RStudio's data viewer
Method 3: Save to a File
If the data is too large to display in the console, you can save it to
a file for easier viewing.
# Save to a CSV file
write.csv(data, file = "output.csv", row.names = FALSE)
# Save to another Excel file (requires the openxlsx or writexl package)
install.packages("writexl")
library(writexl)
write_xlsx(data, path = "output.xlsx")
Method 4: Use RStudio's Data Viewer
If you're using RStudio, the View() function is ideal for visually
inspecting the data.
# Open the data in RStudio's data viewer
View(data)
Note on Large Data
If the sheet has too many rows to fit in the console, consider using
filtering or summarisation before printing. For example:
R
# Display the first 20 rows
head(data, 20)
# Display specific rows (e.g., rows 50 to 70)
data[50:70, ]
You can effectively inspect and handle all the data in your Excel
sheet using these methods!
FINAL CODE FOR UNIT 1:
# Install required packages (if not installed yet)
install.packages("readxl")
install.packages("NLP")
install.packages("tm")
install.packages("stringr")
install.packages("wordcloud")
install.packages("openNLP")
install.packages("SnowballC")
install.packages("topicmodels")
# Load necessary libraries
library(readxl)
library(NLP)
library(tm)
library(stringr)
library(wordcloud)
library(openNLP)
library(SnowballC)
library(topicmodels)
# Load the dataset
file_path <- file.choose()
textdata <- read_excel(file_path)
head(textdata)
print(textdata, n = Inf)
# Preprocess data (convert to lowercase and split by lines)
data1 <- tolower(as.character(textdata[[1]])) # assumes the text is in the first column
linesdata <- unlist(strsplit(data1, split = "\\n"))
str(linesdata)
# Remove non-alphabetic characters and digits, then strip extra whitespace
linesdata1 <- gsub(pattern = "\\W", replacement = " ", linesdata)
linesdata2 <- gsub(pattern = "\\d", replacement = " ", linesdata1)
linesdata2 <- stripWhitespace(linesdata2)
# Combine all lines into a single string and split into words (Word Bag)
single.vector <- paste(linesdata2, collapse = " ")
wordofbags <- str_split(single.vector, pattern = "\\s+")
wordofbags <- unlist(wordofbags)
stopwords()
# Remove stopwords from the text
finaldata <- removeWords(wordofbags, stopwords())
finaldata
wordofbag_clean <- Filter(nzchar, finaldata)
# Word cloud generation
wordcloud(wordofbag_clean, min.freq = 2, colors = "darkblue")
# Perform Stemming (converting words to their root form)
word_stemmed <- wordStem(wordofbag_clean)
print("Stemming Output:")
print(word_stemmed)
# Tokenization: split the text into individual tokens (words) using openNLP annotators
sentence <- as.String(paste(linesdata, collapse = " "))
sent_annotator <- Maxent_Sent_Token_Annotator()
word_annotator <- Maxent_Word_Token_Annotator()
token_annotations <- annotate(sentence, list(sent_annotator, word_annotator))
word_tokens <- sentence[subset(token_annotations, type == "word")]
print("Tokenization Output:")
print(word_tokens)
# Part-of-Speech (POS) Tagging: tag each word with its part of speech (noun, verb, etc.)
pos_annotator <- Maxent_POS_Tag_Annotator()
pos_tags <- annotate(sentence, pos_annotator, token_annotations)
print("POS Tagging Output:")
print(pos_tags)
# Syntactical Parsing: parse the sentence structure
# (Parse_Annotator() requires the openNLPmodels.en package, installed from the datacube repository)
parse_annotator <- Parse_Annotator()
parsed_sentence <- parse_annotator(sentence, token_annotations)
print("Syntactical Parsing Output:")
print(sapply(parsed_sentence$features, `[[`, "parse"))
# Shallow Parsing (Chunking): identify noun and verb phrases
# (Maxent_Chunk_Annotator() also requires openNLPmodels.en)
chunk_annotator <- Maxent_Chunk_Annotator()
chunked_sentence <- annotate(sentence, chunk_annotator, pos_tags)
print("Shallow Parsing (Chunking) Output:")
print(chunked_sentence)
# Topic Modeling using LDA (Latent Dirichlet Allocation)
dtm <- DocumentTermMatrix(Corpus(VectorSource(linesdata)))
lda_model <- LDA(dtm, k = 3) # Assuming we want 3 topics
topics <- topics(lda_model)
print("Topic Modeling Output:")
print(topics)
# Sentiment analysis using custom lexicons (positive and negative words)
positive.words <- scan("/path/to/Positive.rtf", what = "character", comment.char = '\n')
negative.words <- scan("/path/to/Negative.rtf", what = "character", comment.char = '\n')
positive.sentimenbag <- match(wordofbag_clean, positive.words)
negative.sentimenbag <- match(wordofbag_clean, negative.words)
sum.of.positive.sentiment <- sum(!is.na(positive.sentimenbag))
sum.of.negative.sentiment <- sum(!is.na(negative.sentimenbag))
sentiment.score <- sum.of.positive.sentiment - sum.of.negative.sentiment
# Display sentiment score
print("Sentiment Score:")
print(sentiment.score)
Key Differences Between Stemming and Lemmatization
Aspect    | Stemming                   | Lemmatization
Approach  | Algorithmic (rules-based)  | Dictionary-based (lexical)
Speed     | Faster                     | Slower
Output    | May not be a real word     | Always a valid word
Accuracy  | Less accurate              | More accurate
Example   | "studies" → "studi"        | "studies" → "study"
Part-of-Speech (POS) Tagging, Shallow Parsing, and Syntactic Parsing:
Basics and Applications
1. Part-of-Speech (POS) Tagging
Definition: POS tagging involves identifying the grammatical role of each word in a sentence
(e.g., noun, verb, adjective, etc.).
Basic Example:
Given the sentence:
"The quick brown fox jumps over the lazy dog."
POS tagging assigns labels like:
● The → Determiner
● quick → Adjective
● fox → Noun
● jumps → Verb
Code Example in R using the udpipe package:
R
# Install required package
install.packages("udpipe")
library(udpipe)
# Load a pre-trained model (English model)
model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model$file_model)
# Sample text
text <- "The quick brown fox jumps over the lazy dog."
# POS tagging
output <- udpipe_annotate(ud_model, x = text)
output_df <- as.data.frame(output)
print(output_df[, c("token", "upos")]) # Display word and POS tags
Applications in Real Life:
● Grammarly/Auto-correct: Identifying verb forms, noun-verb agreement.
● Search Engines: Highlighting keywords and intent classification.
● Text Summarization/Question Answering: Tagging verbs and nouns to focus on
critical information.
2. Shallow Parsing (Chunking)
Definition: Shallow parsing, or chunking, groups words into chunks (like noun phrases or verb
phrases) without going into deep grammatical structure.
E.g.,
● Sentence: "The quick brown fox jumps over the lazy dog."
Noun Phrase (NP): The quick brown fox
Verb Phrase (VP): jumps
Prepositional Phrase (PP): over the lazy dog
Code Example in R:
R
# Shallow parsing using the same `udpipe` output: recode the POS tags and
# extract noun-phrase chunks
output_df$phrase_tag <- as_phrasemachine(output_df$upos, type = "upos")
noun_phrases <- keywords_phrases(x = output_df$phrase_tag, term = output_df$token,
                                 pattern = "(A|N)*N(P+D*(A|N)*N)*",
                                 is_regex = TRUE, detailed = FALSE)
print(noun_phrases) # Noun-phrase chunks such as "quick brown fox"
Applications in Real Life:
● Named Entity Recognition (NER): Detecting entities like names, places, dates.
● Information Extraction: Extracting "chunks" of valuable information like product
features or reviews.
● Grammarly: Suggesting rewrites or completing partial phrases.
3. Syntactic Parsing (Deep Parsing)
Definition: Syntactic parsing delves into the grammatical structure of a sentence to represent
relationships between words using a tree structure.
For example:
● Sentence: "The cat chased the mouse."
Structure:
○ Root: chased
■ Subject: The cat
■ Object: the mouse
Code Example in R:
R
# Extract syntactic dependencies
syntax_tree <- output_df[, c("token", "head_token_id", "dep_rel")]
print(syntax_tree) # Dependency relationships
Applications in Real Life:
● Grammarly/Auto-correct: Detecting sentence fragments or misplaced modifiers.
● Voice Assistants (Alexa/Google Assistant): Understanding complex commands.
● Chatbots: Structuring user inputs into actionable intents.
● Machine Translation: Generating accurate translations based on sentence structure.
Workflow Example Combining All Three
1. Input Sentence: "The small child reads a book."
2. POS Tagging: Identifies words and their roles (e.g., small → adjective, reads →
verb).
3. Shallow Parsing: Extracts phrases (e.g., Noun Phrase: The small child).
4. Syntactic Parsing: Builds a structure showing that child is the subject of reads.
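A minimal udpipe sketch of this workflow, assuming `ud_model` is the English model loaded in the POS tagging example above:
R
# Annotate the example sentence
wf <- as.data.frame(udpipe_annotate(ud_model, x = "The small child reads a book."))
# 1. POS tagging: word-level roles
wf[, c("token", "upos")]
# 2. Shallow parsing: rebuild noun-phrase chunks from the tags
wf$phrase_tag <- as_phrasemachine(wf$upos, type = "upos")
keywords_phrases(wf$phrase_tag, term = wf$token,
                 pattern = "(A|N)*N(P+D*(A|N)*N)*", is_regex = TRUE, detailed = FALSE)
# 3. Syntactic parsing: dependency structure ("child" should appear as nsubj of "reads")
wf[, c("token", "dep_rel", "head_token_id")]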
Advanced Insights
1. Error Detection and Correction:
○ POS Tagging: Helps identify missing articles (a, the) or incorrect tense usage.
○ Parsing: Detects syntactic errors, e.g., misplaced modifiers or fragments.
2. Applications in Real-life Systems:
○ Grammarly/Auto-correct: Uses POS tagging for word-level checks and
syntactic parsing for sentence-level issues.
○ Search Engine Optimization: POS tagging highlights keywords, while parsing
helps detect relationships.
○ Machine Learning Models: Input features derived from these techniques (e.g.,
parse trees, tagged words).
What is udpipe?
udpipe (Universal Dependencies Pipeline) is an R package that provides tools for tokenizing,
POS tagging, lemmatization, and dependency parsing of text. It uses Universal Dependencies
(UD), which is a framework for consistent annotation of grammar across different languages.
The package supports multiple pre-trained language models, allowing users to perform linguistic
annotations for text in various languages.
Why Use udpipe?
1. Versatility: Handles multiple languages with pre-trained models.
2. Comprehensive Annotation: Outputs tokens, POS tags, lemmas, and syntactic
dependencies.
3. Ease of Use: Provides a complete text annotation pipeline within R.
4. Application Areas: Useful for text mining, natural language processing (NLP), and
linguistic analysis.
Explaining the Syntax
Let's break down the syntax used in the code examples:
1. Installing and Loading the Package
R
install.packages("udpipe")
library(udpipe)
● Installs and loads the udpipe package.
2. Downloading a Pre-Trained Model
R
model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model$file_model)
● udpipe_download_model(language = "english"): Downloads a pre-trained
model for the specified language (e.g., English). The downloaded model includes rules
for tokenization, POS tagging, lemmatization, and dependency parsing.
● udpipe_load_model(model$file_model): Loads the downloaded model into
memory for further processing.
3. Annotating Text
R
output <- udpipe_annotate(ud_model, x = text)
output_df <- as.data.frame(output)
● udpipe_annotate(): Performs tokenization, POS tagging, lemmatization, and
syntactic parsing on the input text (x = text) using the loaded model.
○ Input: Raw text (e.g., "The quick brown fox jumps over the lazy
dog.")
○ Output: A structured annotation, which includes tokens, lemmas, POS tags, and
syntactic dependencies.
● as.data.frame(): Converts the output into a data frame for easier manipulation and
visualization in R.
4. Accessing Specific Annotations
R
print(output_df[, c("token", "upos")])
● output_df: The annotated data in tabular form.
○ Columns:
■ token: Each word/token in the text.
■ upos: The Universal Part-of-Speech tag (e.g., NOUN, VERB, ADJ).
Sample Output:
token upos
The DET
quick ADJ
brown ADJ
fox NOUN
jumps VERB
5. Exploring Syntactic Parsing
R
syntax_tree <- output_df[, c("token", "head_token_id", "dep_rel")]
● head_token_id: Identifies the "head" word for each token, based on syntactic
relationships.
● dep_rel: Indicates the dependency relationship (e.g., nsubj for nominal subject, obj
for object).
Sample Output:
token   head_token_id   dep_rel
The     4               det
quick   4               amod
fox     5               nsubj
jumps   0               root
Advanced Features of udpipe
1. Named Entity Recognition (NER): Detects entities like names, dates, and locations.
Add columns like entity to identify such entities.
2. Customization: Use custom models for domain-specific languages (e.g., medical or
legal).
3. Visualization: Use external libraries (like igraph) to plot syntactic dependency trees.
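For point 3, a sketch of plotting the dependency tree with igraph (assuming the igraph package is installed and `output_df` is the single-sentence annotation from the earlier example):
R
install.packages("igraph")
library(igraph)
# Build edges from each token's head to the token itself (drop the root, head id "0")
edges <- subset(output_df, head_token_id != "0", select = c("head_token_id", "token_id"))
g <- graph_from_data_frame(edges, vertices = output_df[, c("token_id", "token")])
plot(g, vertex.label = V(g)$token, edge.arrow.size = 0.3)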
1. Finding Implicit Features
Implicit features refer to attributes or aspects of an entity that are not explicitly mentioned but
can be inferred from the context. For example, in "The battery lasts all day," the feature "battery
life" is implicit.
Approach: Co-occurrence Analysis
1. Identify Opinion Words (e.g., adjectives/adverbs).
2. Find their associated nouns or entities in the context.
Example in R
R
# Install and load necessary libraries
install.packages("text")
install.packages("tidytext")
library(text)
library(tidytext)
# Sample text
text <- c("The camera is amazing. The battery lasts all day. I love
the sleek design.")
# Tokenize and annotate part-of-speech tags
install.packages("udpipe")
library(udpipe)
model <- udpipe_download_model(language = "english")
ud_model <- udpipe_load_model(model$file_model)
annotated <- udpipe_annotate(ud_model, x = text)
annotated_df <- as.data.frame(annotated)
# Filter nouns and associated adjectives
features <- annotated_df[annotated_df$upos %in% c("NOUN") &
                         annotated_df$dep_rel %in% c("amod", "nsubj"), ]
features
Output:
token     upos   dep_rel   head token
camera    NOUN   nsubj     amazing
battery   NOUN   nsubj     lasts
design    NOUN   obj       love
This output reveals implicit features like "battery" inferred from the verb "lasts" and "design" from
"sleek."
2. Finding Opinion Phrases
Opinion phrases describe sentiments or evaluations. These phrases often consist of:
1. Adjectives: "Amazing camera."
2. Adverbs + Adjectives: "Really sleek design."
3. Verb Phrases: "I love the design."
Example in R
R
# Extract opinion phrases (adjectives, adverbs, verbs)
opinions <- annotated_df[annotated_df$upos %in% c("ADJ", "ADV", "VERB"), ]
opinion_phrases <- merge(opinions, features, by = "head_token_id",
                         suffixes = c("_opinion", "_feature"))
# Display opinion phrases
opinion_phrases[, c("token_opinion", "token_feature")]
Output:
token_opinion   token_feature
amazing         camera
lasts           battery
sleek           design
3. Context-Specific Word Semantic Orientation
Semantic orientation refers to the polarity (positive/negative/neutral) of a word in a specific
context. For instance, "cheap" can be positive (affordable) or negative (low quality) depending
on the context.
Approach: Use Sentiment Lexicons
You can calculate semantic orientation by:
1. Matching words to sentiment lexicons (e.g., Bing, NRC).
2. Aggregating scores for each context.
Example in R
R
# Install and load sentiment lexicon (dplyr is needed for the pipes below)
install.packages("tidytext")
library(tidytext)
library(dplyr)
# Use the Bing lexicon
bing <- get_sentiments("bing")
# Tokenize text
tokens <- tibble(text = text) %>%
unnest_tokens(word, text)
# Join tokens with the lexicon
semantic_orientation <- tokens %>%
inner_join(bing, by = c("word" = "word"))
# Calculate context-specific sentiment
context_sentiment <- semantic_orientation %>%
count(sentiment, sort = TRUE)
context_sentiment
Output:
sentiment   n
positive    4
negative    1
4. Analysis of Words with R
Perform detailed word analysis, including:
1. Frequency Analysis
2. Word Co-occurrence
3. Word Clouds
Example: Frequency Analysis
R
# Frequency analysis
word_counts <- tokens %>%
count(word, sort = TRUE)
word_counts
Output:
word n
the 4
is 2
sleek 1
Example: Word Co-occurrence
Find words that frequently occur together in the same context.
R
# Find word pairs that co-occur within the same document
install.packages("widyr")
library(widyr)
tokens_with_id <- tibble(doc = seq_along(text), text = text) %>%
  unnest_tokens(word, text)
word_pairs <- tokens_with_id %>%
  pairwise_count(word, doc, sort = TRUE)
word_pairs
Output:
item1     item2    n
the       camera   1
battery   lasts    1
Example: Word Cloud
R
# Install and load wordcloud
install.packages("wordcloud")
library(wordcloud)
# Create a word cloud
wordcloud(words = word_counts$word, freq = word_counts$n, max.words = 50)
Real-Life Applications
1. Customer Feedback Analysis:
○ Identify implicit features (e.g., "battery life") and associated sentiments.
2. Product Reviews:
○ Extract opinion phrases to improve product descriptions.
3. Social Media Monitoring:
○ Perform semantic orientation for brand perception analysis.
4. Market Research:
○ Analyze co-occurrence of words to identify trending topics.
5. Chatbots and Voice Assistants:
○ Understand user intents and associate context-specific meanings.
Using the tm Package
Step 1: Install and Load Libraries
R
install.packages("tm")
library(tm)
Step 2: Create a Text Corpus
R
# Sample documents
documents <- c(
"The camera is amazing. The battery lasts all day.",
"I love the sleek design and lightweight body.",
"The screen resolution is fantastic but the battery drains quickly."
)
# Create a corpus
corpus <- Corpus(VectorSource(documents))
Step 3: Preprocess the Text
● Convert text to lowercase.
● Remove punctuation, stopwords, and extra whitespace.
R
# Preprocess the corpus
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
Step 4: Create the Term-Document Matrix
R
# Generate Term-Document Matrix
tdm <- TermDocumentMatrix(corpus)
# Convert TDM to a matrix for easier visualization
tdm_matrix <- as.matrix(tdm)
print(tdm_matrix)
Output (TDM Matrix):
          Doc1  Doc2  Doc3
amazing    1     0     0
battery    1     0     1
body       0     1     0
camera     1     0     0
drains     0     0     1
Step 5: Calculate Term Frequencies
R
# Term Frequency
term_frequency <- rowSums(tdm_matrix)
term_frequency <- sort(term_frequency, decreasing = TRUE)
# View term frequencies
print(term_frequency)
Output:
Term      Frequency
battery   2
amazing   1
camera    1
body      1
drains    1
Using the tidytext Package
The tidytext package works with text data in a tidy format, making it easy to use with other
tidyverse libraries.
Step 1: Install and Load Libraries
R
install.packages("tidytext")
install.packages("dplyr")
library(tidytext)
library(dplyr)
Step 2: Create a Tidy Dataset
R
# Convert the text to a tidy tibble
text_data <- tibble(document = paste0("Doc", 1:length(documents)),
text = documents)
Step 3: Tokenize the Text
Tokenize the text into individual words and remove stopwords.
R
# Tokenize and remove stopwords
tokens <- text_data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
print(tokens)
Output:
document   word
Doc1       camera
Doc1       amazing
Doc1       battery
Doc1       lasts
Doc1       day
Step 4: Calculate Term Frequencies
R
# Calculate term frequencies
term_frequency <- tokens %>%
count(word, sort = TRUE)
print(term_frequency)
Output:
word      n
battery   2
amazing   1
camera    1
day       1
design    1
Step 5: Create a Term-Document Matrix
R
# Create a Term-Document Matrix (terms in rows, documents in columns)
tdm <- tokens %>%
  count(document, word) %>%
  cast_tdm(word, document, n)
# View the TDM
print(as.matrix(tdm))
Output (TDM Matrix):
          Doc1  Doc2  Doc3
amazing    1     0     0
battery    1     0     1
body       0     1     0
camera     1     0     0
drains     0     0     1
Applications of TDM and Term Frequency
1. Customer Feedback Analysis:
○ Identify frequently mentioned issues or positive aspects of products.
2. Sentiment Analysis:
○ Use term frequencies as features to classify sentiment.
3. Topic Modeling:
○ Combine TDM with algorithms like Latent Dirichlet Allocation (LDA) to identify
hidden topics in text (see the sketch after this list).
4. Spam Detection:
○ Use word frequencies to identify spam keywords.
5. Search Engine Optimization:
○ Analyze frequently used words to optimize web content.
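For point 3, a small sketch that feeds a tidytext document-term matrix into LDA, reusing the `tokens` object from the previous section (k = 2 topics and the seed are arbitrary choices for this tiny corpus):
R
library(topicmodels)
# Cast the token counts into a DocumentTermMatrix and fit LDA
dtm <- tokens %>%
  count(document, word) %>%
  cast_dtm(document, word, n)
lda_model <- LDA(dtm, k = 2, control = list(seed = 123))
terms(lda_model, 3)   # top 3 terms per topic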
Scenario
We have a simple sentence:
"The fast camera captures sharp images."
Our goal:
1. Subsequence Kernels: Focus on extracting relationships using linguistic
dependencies (e.g., "fast modifies camera").
2. String Kernels: Focus on pattern similarity or matching subsequences directly from
the text (e.g., identifying common word patterns like "adjective-noun pairs").
1. Subsequence Kernels
Focus
● Relies on dependency parsing to find structured relationships.
● Example: Adjective modifies a noun, or a subject is associated with a verb.
Implementation in R
R
library(udpipe)
library(dplyr)
# Example sentence
text <- "The fast camera captures sharp images."
# Load English udpipe model
ud_model <- udpipe_download_model(language = "english") # Download if not already present
ud_model <- udpipe_load_model(ud_model$file_model)
# Annotate the sentence
annotations <- udpipe_annotate(ud_model, x = text)
annotations_df <- as.data.frame(annotations)
# udpipe stores only the id of each token's head, so look up its text
# (a simple match() works here because the example is a single sentence)
annotations_df$head_token <- annotations_df$token[match(annotations_df$head_token_id,
                                                        annotations_df$token_id)]
# Extract structured relationships (adjective modifier, subject, object)
subsequence_relations <- annotations_df %>%
  filter(dep_rel %in% c("amod", "nsubj", "obj")) %>%
  select(token, head_token, dep_rel)
print(subsequence_relations)
Output
token    head_token   dep_rel
fast     camera       amod
camera   captures     nsubj
sharp    images       amod
Interpretation:
● "fast" modifies "camera" (amod).
● "camera" is the subject of "captures" (nsubj).
● "sharp" modifies "images" (amod).
2. String Kernels
Focus
● Directly works on text to identify patterns without relying on parsing.
● Example: Find adjective-noun pairs or common subsequences.
Implementation in R
R
# Extract pairs using pattern matching
string_patterns <- annotations_df %>%
filter(upos == "ADJ") %>% # Find adjectives
  mutate(pair = paste(token, head_token, sep = " ")) %>% # Pair with head noun
select(pair)
print(string_patterns)
Output
pair
fast camera
sharp images
Interpretation:
● "fast camera" and "sharp images" are identified as adjective-noun pairs.
Key Differences
Aspect            | Subsequence Kernels                                 | String Kernels
Approach          | Uses dependency parsing for structured extraction.  | Uses raw text or parsed text for pattern matching.
Output            | Focuses on grammatical relationships (e.g., amod).  | Focuses on string patterns (e.g., adjective + noun).
Example Relation  | "fast modifies camera" (amod).                      | "fast camera" (adjective-noun pair).
Use Case          | Relation extraction in NLP tasks.                   | Pattern detection or similarity measurement.
When to Use Each
● Use Subsequence Kernels if you need grammatical relationships like
subject-verb-object or modifier relations.
● Use String Kernels for pattern detection or when dependency parsing is not available.
1. TF-IDF (Term Frequency-Inverse Document Frequency)
Definition
● TF-IDF measures the importance of a word in a document relative to a collection of
documents (corpus).
○ TF: Frequency of a term in a document.
○ IDF: Penalizes terms that are frequent across all documents, giving more weight
to unique terms.
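In symbols (standard formulation; the notation here is added for clarity):
tf_idf(t, d) = tf(t, d) × log(N / df(t))
where tf(t, d) is the frequency of term t in document d, N is the number of documents, and df(t) is the number of documents containing t. The bind_tf_idf() function used below adds exactly these tf, idf, and tf_idf columns.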
Example
Suppose you have two product reviews:
1. "The camera is excellent and the battery lasts long."
2. "Battery life is short but the camera quality is excellent."
Words like "camera" and "battery" are common across both reviews, so their IDF scores will be
lower compared to unique words like "long" or "quality."
Code Implementation (Using udpipe)
library(udpipe)
library(dplyr)
library(tidytext)
# Example data
reviews <- data.frame(doc_id = c("doc1", "doc2"),
                      text = c("The camera is excellent and the battery lasts long.",
                               "Battery life is short but the camera quality is excellent."))
# Load a model
ud_model <- udpipe_download_model(language = "english") # Download if needed
ud_model <- udpipe_load_model(ud_model$file_model)
# Annotate text
annotations <- udpipe_annotate(ud_model, x = reviews$text, doc_id = reviews$doc_id)
annotations <- as.data.frame(annotations)
# Compute TF-IDF
tf_idf_data <- annotations %>%
count(doc_id, lemma) %>%
bind_tf_idf(term = lemma, document = doc_id, n = n) %>%
arrange(desc(tf_idf))
print(tf_idf_data)
2. Zipf's Law
Definition
● Zipf’s Law states that in a large corpus, the frequency of a word is inversely proportional
to its rank.
○ The most frequent word appears twice as often as the second most frequent
word, three times as often as the third, and so on.
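In symbols: frequency(r) ≈ C / r, where r is a word's frequency rank and C is a constant, so rank × frequency stays roughly constant across the vocabulary.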
Example
For our reviews, the most frequent word might be "camera," followed by "battery." If "camera"
occurs 8 times and "battery" occurs 4 times, the distribution roughly follows Zipf's Law.
Code Implementation
R
# Word frequency distribution with an explicit rank column
word_freq <- annotations %>%
  count(lemma, sort = TRUE) %>%
  mutate(rank = row_number())
# Visualize Zipf's Law on log-log axes
library(ggplot2)
ggplot(word_freq, aes(x = rank, y = n)) +
  geom_line() +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Zipf's Law: Word Frequency Distribution",
       x = "Rank (log scale)", y = "Frequency (log scale)")
3. bind_tf_idf Function
Definition
● The bind_tf_idf function from the tidytext package calculates the TF-IDF scores
for terms in a corpus.
● The code is given above; a tiny self-contained example follows.
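A minimal sketch of bind_tf_idf on toy counts (the words and numbers below are made up purely for illustration):
R
library(dplyr)
library(tidytext)
counts <- tibble(document = c("d1", "d1", "d2"),
                 word = c("camera", "battery", "battery"),
                 n = c(2, 1, 3))
# Adds tf, idf and tf_idf columns to the count table
counts %>% bind_tf_idf(term = word, document = document, n = n)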