Programming Assignment
University of the People
CS 3308-01: Information Retrieval
Instructor: Sharina Babb
16th July 2025
import string
import sys
import os
import re
import math
import sqlite3
import time
from typing import Dict, Set, List
from collections import defaultdict, Counter
from nltk.stem import PorterStemmer
# Define stop words
stop_words = set(["a", "an", "the", "and", "or", "but", "is", "are",
                  "was", "were", "in", "of", "to", "with"])
# Compile regex patterns for efficiency
chars = re.compile(r'\W+')
pattid = re.compile(r'(\d{3})/(\d{3})/(\d{3})')
# Global counters for corpus statistics
tokens = 0
documents = 0
terms = 0
stop_word_count = 0
# Database to store term information
database: Dict[str, 'Term'] = {}
class Term:
    """
    Class to represent term information in the index.
    Stores term frequency, document frequency, and posting information.
    """
    def __init__(self):
        self.termid: int = 0
        self.termfreq: int = 0
        self.docs: int = 0
        self.docids: Dict[int, int] = {}
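# Example (illustrative values only): after indexing, database['comput'] might
# hold termid=12, termfreq=42, docs=7 and docids={3: 5, 9: 2, ...}, meaning the
# stemmed term occurs 5 times in document 3, twice in document 9, and so on.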
def splitchars(line: str) -> List[str]:
    """
    Split input text into tokens based on non-word characters
    Args:
        line: Input text string
    Returns:
        List of tokens
    """
    return chars.split(line)
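# Example: splitchars("Information Retrieval, 1958.") returns roughly
# ['Information', 'Retrieval', '1958', ''] -- splitting on \W+ can leave empty
# strings, which are discarded later in parsetoken().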
def remove_stop_words(tokens: List[str]) -> List[str]:
    global stop_word_count
    filtered_tokens = [token for token in tokens if token not in stop_words]
    stop_word_count += len(tokens) - len(filtered_tokens)
    return filtered_tokens
ps = PorterStemmer()
def stem_tokens(tokens: List[str]) -> List[str]:
    return [ps.stem(token) for token in tokens]
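# Example: the Porter stemmer conflates related word forms onto a common stem,
# e.g. ps.stem("running") -> "run" and ps.stem("retrieval") -> "retriev"
# (stems are not always dictionary words).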
def remove_punctuation_tokens(tokens: List[str]) -> List[str]:
    return [token for token in tokens
            if token and token[0] not in string.punctuation]
def parsetoken(line: str) -> List[str]:
    """
    Process a line of text to extract and index terms
    Args:
        line: Input text line
    Returns:
        List of processed tokens
    """
    global documents, tokens, terms
    # Normalize input text
    line = line.replace('\t', ' ').strip()
    # Split into tokens and lowercase them first, so stop-word matching
    # and stemming are case-insensitive
    token_list = [token.lower() for token in splitchars(line)]
    token_list = remove_stop_words(token_list)
    token_list = stem_tokens(token_list)
    token_list = remove_punctuation_tokens(token_list)
    for token in token_list:
        # Clean and normalize token
        lower_token = token.replace('\n', '').strip()
        if not lower_token:  # Skip empty tokens
            continue
        tokens += 1  # Increment total token count
        # Add new term to database if not exists
        if lower_token not in database:
            terms += 1
            database[lower_token] = Term()
            database[lower_token].termid = terms
            database[lower_token].docids = {}
            database[lower_token].docs = 0
        # Update posting information
        if documents not in database[lower_token].docids:
            database[lower_token].docs += 1
            database[lower_token].docids[documents] = 0
        # Update term frequency
        database[lower_token].docids[documents] += 1
        database[lower_token].termfreq += 1
    return token_list
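# Example trace (illustrative): the line "The indexing of documents" is split
# into ['The', 'indexing', 'of', 'documents'], lowercased, stripped of the stop
# words 'the' and 'of', and stemmed, leaving roughly ['index', 'document'] to be
# recorded in the in-memory database.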
def process(filename: str) -> bool:
    """
    Process a single document file
    Args:
        filename: Path to document file
    Returns:
        Boolean indicating success
    """
    try:
        # print(f"Reading file: {filename}")
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                parsetoken(line)
        return True
    except IOError as e:
        print(f"Error processing file {filename}: {str(e)}")
        return False
    except UnicodeDecodeError:
        print(f"Unicode decode error in file {filename}")
        return False
def walkdir(cur: sqlite3.Cursor, dirname: str) -> bool:
    """
    Recursively walk through directory and process all files
    Args:
        cur: Database cursor
        dirname: Directory path
    Returns:
        Boolean indicating success
    """
    global documents
    try:
        # Get all files and directories
        all_items = [f for f in os.listdir(dirname)
                     if os.path.isdir(os.path.join(dirname, f))
                     or os.path.isfile(os.path.join(dirname, f))]
        for item in all_items:
            full_path = os.path.join(dirname, item)
            if os.path.isdir(full_path):
                print(f"Entering directory: {full_path}")
                walkdir(cur, full_path)
            else:
                # print(f"Processing file: {full_path}")
                documents += 1
                # Add document to dictionary
                cur.execute("INSERT INTO DocumentDictionary VALUES (?, ?)",
                            (full_path, documents))
                if not process(full_path):
                    print(f"Failed to process file: {full_path}")
        return True
    except Exception as e:
        print(f"Error walking directory {dirname}: {str(e)}")
        return False
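# Note: an equivalent traversal could be written with os.walk(), roughly:
#   for root, _dirs, files in os.walk(dirname):
#       for name in files:
#           process(os.path.join(root, name))
# The explicit recursion above keeps per-directory progress messages.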
def setup_database(cursor: sqlite3.Cursor):
    """
    Set up database tables and indexes
    Args:
        cursor: Database cursor
    """
    # Drop tables left over from a previous run
    cursor.execute("DROP TABLE IF EXISTS DocumentDictionary")
    cursor.execute("DROP TABLE IF EXISTS TermDictionary")
    cursor.execute("DROP TABLE IF EXISTS Posting")
    # Create new tables
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS DocumentDictionary (
            DocumentName TEXT,
            DocId INTEGER PRIMARY KEY
        )
    """)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS TermDictionary (
            Term TEXT,
            TermId INTEGER PRIMARY KEY
        )
    """)
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS Posting (
            TermId INTEGER,
            DocId INTEGER,
            tfidf REAL,
            docfreq INTEGER,
            termfreq INTEGER,
            FOREIGN KEY(TermId) REFERENCES TermDictionary(TermId),
            FOREIGN KEY(DocId) REFERENCES DocumentDictionary(DocId)
        )
    """)
    # Create indexes
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_term ON TermDictionary(Term)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_term ON Posting(TermId)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_posting_doc ON Posting(DocId)")
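# Example rows (illustrative values): TermDictionary might hold ('comput', 12),
# DocumentDictionary ('./cacm/CACM-0001.html', 1), and Posting (12, 1, 3.22, 7, 2),
# i.e. term 12 occurs twice in document 1, appears in 7 documents overall, and
# with, say, 35 documents in the corpus its weight is 2 * ln(35 / 7) ≈ 3.22.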
def calculate_frequencies():
    """Build per-document term frequencies and per-term document frequencies."""
    term_frequencies = defaultdict(Counter)   # doc_id -> {term: frequency}
    document_frequencies = defaultdict(int)   # term -> number of docs containing it
    for term, term_obj in database.items():
        document_frequencies[term] = len(term_obj.docids)
        for doc_id, freq in term_obj.docids.items():
            term_frequencies[doc_id][term] = freq
    return term_frequencies, document_frequencies
def calculate_idf(document_frequencies, total_docs):
    idf = {}
    for term, df in document_frequencies.items():
        idf[term] = math.log(total_docs / df)
    return idf
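# Worked example: with 3,204 documents and a term that appears in 32 of them,
# idf = ln(3204 / 32) ≈ 4.61 (natural log, since math.log is used above).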
def calculate_tf_idf(term_frequencies, idf):
    """Weight each term in each document by tf * idf."""
    tf_idf = {}
    for doc_id, tf in term_frequencies.items():
        tf_idf[doc_id] = {term: freq * idf.get(term, 0)
                          for term, freq in tf.items()}
    return tf_idf
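# Worked example: a term occurring 3 times in a document with idf ≈ 4.61 gets a
# tf-idf weight of 3 * 4.61 ≈ 13.8 for that document.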
class InvertedIndex:
    def __init__(self):
        self.index = defaultdict(list)

    def add_document(self, doc_id, tf_idf):
        for term, weight in tf_idf.items():
            self.index[term].append((doc_id, weight))
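# Usage sketch (assuming the per-document weights computed by calculate_tf_idf):
#   index = InvertedIndex()
#   for doc_id, weights in tf_idf.items():
#       index.add_document(doc_id, weights)
#   index.index['comput']  # e.g. [(1, 3.22), (9, 6.44)] -- illustrative values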
def report_statistics():
    total_terms = sum(term.termfreq for term in database.values())
    unique_terms = len(database)
    print(f"Number of documents processed: {documents}")
    print(f"Total number of terms parsed from all documents: {tokens}")
    print(f"Total number of unique terms found and added to the index: {unique_terms}")
    print(f"Total number of terms found that matched one of the stop words: {stop_word_count}")
def main():
    """
    Main execution function
    """
    # Record start time
    start_time = time.localtime()
    print(f"Start Time: {start_time.tm_hour:02d}:{start_time.tm_min:02d}")
    # Initialize database
    db_path = "cacm_index.db"
    conn = sqlite3.connect(db_path)
    conn.isolation_level = None  # Enable autocommit
    cursor = conn.cursor()
    # Setup database tables
    setup_database(cursor)
    # Process corpus
    corpus_path = "./cacm"  # Update this path to match your environment
    if not os.path.exists(corpus_path):
        print(f"Error: Corpus directory not found at {corpus_path}")
        return
    walkdir(cursor, corpus_path)
    # Calculate tf-idf for each term in each document
    term_frequencies, document_frequencies = calculate_frequencies()
    idf = calculate_idf(document_frequencies, documents)
    tf_idf = calculate_tf_idf(term_frequencies, idf)
    # Insert terms into database
    for term, term_obj in database.items():
        cursor.execute("INSERT INTO TermDictionary (Term, TermId) VALUES (?, ?)",
                       (term, term_obj.termid))
        # Insert posting information, using the precomputed tf-idf weights
        for doc_id, freq in term_obj.docids.items():
            tfidf = tf_idf[doc_id][term]
            cursor.execute("""
                INSERT INTO Posting
                    (TermId, DocId, tfidf, docfreq, termfreq)
                VALUES (?, ?, ?, ?, ?)
            """, (term_obj.termid, doc_id, tfidf, term_obj.docs, freq))
    # Commit changes and close connection
    conn.commit()
    conn.close()
    # Print statistics
    report_statistics()
    end_time = time.localtime()
    print(f"\nEnd Time: {end_time.tm_hour:02d}:{end_time.tm_min:02d}")

if __name__ == '__main__':
    main()
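
A quick way to sanity-check the index after a run is to query the SQLite file directly. The snippet below is a minimal sketch, assuming the cacm_index.db file and table layout created by setup_database() above; the helper name top_documents is introduced here for illustration only. It lists the highest-weighted documents for one stemmed term.

import sqlite3

def top_documents(term: str, limit: int = 5):
    """Return the documents with the highest tf-idf weight for a stemmed term."""
    conn = sqlite3.connect("cacm_index.db")
    cur = conn.cursor()
    cur.execute("""
        SELECT d.DocumentName, p.tfidf
        FROM TermDictionary t
        JOIN Posting p ON p.TermId = t.TermId
        JOIN DocumentDictionary d ON d.DocId = p.DocId
        WHERE t.Term = ?
        ORDER BY p.tfidf DESC
        LIMIT ?
    """, (term, limit))
    rows = cur.fetchall()
    conn.close()
    return rows

# Example (the query term must already be stemmed, e.g. 'comput' not 'computer'):
# for name, weight in top_documents('comput'):
#     print(f"{weight:8.3f}  {name}")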