SAHIL MALHOTRA
16 BCE 0113
WEB MINING L51+L52
1. UNIVERSAL CRAWLING
1.1. CODE
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has,
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL and joining it with the
                    # base URL. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links.append(newUrl)

    # This is a new function that we are creating to get links;
    # our spider() function will call it
    def getLinks(self, url, numberVisited):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        htmlBytes = response.read()
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or PDFs, for example)
        if 'text/html' in response.getheader('Content-Type'):
            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            # Save the raw page as a numbered .txt file so it can be
            # indexed later by the cosine similarity ranker
            file_name = str(numberVisited) + '.txt'
            file = open(file_name, 'wb')
            file.write(htmlBytes)
            file.close()
            # Record which URL went into which file, one URL per line
            f = open("FileCheckDatabase", 'a')
            f.write(url + '\n')
            f.close()
            print("File writing successful for " + str(numberVisited) + "!")
            self.feed(htmlString)
            return self.links
        else:
            return []

# And finally here is our spider. It takes in a starting URL and the
# number of pages to visit before giving up
def spider(url, maxPages):
    pagesToVisit = [url]
    print(pagesToVisit)
    numberVisited = 0
    # The main loop. Create a LinkParser and get all the links on the page.
    # getLinks() saves the page to disk and returns the set of links from
    # that web page (which is useful for deciding where to go next).
    while numberVisited < maxPages and pagesToVisit != []:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            links = parser.getLinks(url, numberVisited)
            # Add the links that we found to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print('List length count: ' + str(len(pagesToVisit)))
        except Exception as e:
            print(str(e))

spider('https://en.wikipedia.org/wiki/Google', 50)
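The crawler depends on parse.urljoin to turn relative href values into absolute URLs before queueing them. The following is a minimal sketch of that behaviour (the href values are made up purely for illustration):

from urllib import parse

base = 'https://en.wikipedia.org/wiki/Google'
# A relative href is resolved against the base page
print(parse.urljoin(base, '/wiki/Web_crawler'))               # https://en.wikipedia.org/wiki/Web_crawler
# An already-absolute href is returned unchanged
print(parse.urljoin(base, 'https://example.com/page.html'))   # https://example.com/page.html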
1.2. FILES CREATED
1.3. RUNNING SAMPLE FILE
2. COSINE SIMILARITY
2.1. CODE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import glob

# The query to rank the crawled pages against
# query = input("Enter Query:")
query = "WikiPedia is a very good site for learning, and is very helpful for children"

file_names = []
d = {}
train_set = []
train_set.append(query)

# Read every crawled page (the crawler saved them as <number>.txt)
flist = glob.glob('*.txt')
for fname in flist:
    file_names.append(fname)
    f = open(fname, encoding="UTF-8")
    train_set.append(f.read())
    f.close()

# Build one TF-IDF matrix over the query (row 0) plus all documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

# Cosine similarity between the query and each document
cosine_similar_value = []
for i in range(1, len(train_set)):
    c = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[i])
    cosine_similar_value.append(float(c[0][0]))

# Remember which score belongs to which document, then sort the
# scores in descending order
for j in range(0, len(file_names)):
    d[j] = cosine_similar_value[j]
cosine_similar_value.sort(reverse=True)

# FileCheckDatabase lists the crawled URLs in order, so the file
# "<h>.txt" corresponds to the h-th URL in that list
file = open('FileCheckDatabase', 'r')
data = file.read()
l = data.split()

# Print the URLs of the ten most similar pages, best match first
for i in range(0, 10):
    x = list(d.keys())[list(d.values()).index(cosine_similar_value[i])]
    del d[x]
    h = file_names[x]
    h = int(h[:-4])
    print(l[h - 1])
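Since cosine_similarity also accepts a whole matrix as its second argument, the per-document loop and the dictionary lookup above can be collapsed into a single call plus an argsort. This is only an alternative sketch, reusing the tfidf_matrix_train, file_names, and l variables built above:

import numpy as np

# Similarity of the query (row 0) against every row; drop the query itself
scores = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train)[0][1:]

# Indices of the ten highest-scoring documents, best match first
for x in np.argsort(scores)[::-1][:10]:
    h = int(file_names[x][:-4])    # "17.txt" -> 17
    print(l[h - 1], scores[x])     # crawled URL and its cosine score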
3. TASK 3: TF-IDF VECTORS WITH SCIKIT-LEARN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # Documents
test_set = ["The sun in the sky is bright."]            # Query

stopWords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

# Term-count vectors for the documents and for the query
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

# TF-IDF weights for the documents
transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

# TF-IDF weights for the query
transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
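The np and LA imports above can be used to finish the comparison: the cosine of the angle between the query's TF-IDF vector and each document's is their inner product divided by the product of their norms. A minimal sketch, assuming the tfidf and trainVectorizerArray variables produced above:

# Cosine similarity between the query and each document, computed by hand
cosine = lambda a, b: float(np.inner(a, b) / (LA.norm(a) * LA.norm(b)))

doc_tfidf = TfidfTransformer().fit_transform(trainVectorizerArray).toarray()
query_tfidf = np.asarray(tfidf.todense())[0]

for i, doc_vec in enumerate(doc_tfidf):
    print('cosine(query, document %d) = %.3f' % (i, cosine(query_tfidf, doc_vec)))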
4. TASK 4: DOT PRODUCT AND VECTOR NORMS WITH NUMPY
import numpy as np
y = [1.0, 1.0, 1.0, 0.0, 0.0]
x = [0.0, 1.0, 0.0, 1.0, 1.0]
np.dot(x,y)
OUTPUT:
1.0
x_dot_y = sum([(1.0 * 0.0) + (1.0 * 1.0) + (1.0 * 0.0) +
(0.0 * 1.0) + (0.0 * 1.0)])
x_dot_y
OUTPUT:
1.0
from numpy.linalg import norm
y = [1.0, 1.0, 1.0, 0.0, 0.0]
x = [0.0, 1.0, 0.0, 1.0, 1.0]
norm(x) * norm(y)
OUTPUT:
2.9999999999999996
import math
# with np.dot
math.sqrt(np.dot(x,x)) * math.sqrt(np.dot(y,y))
OUTPUT:
2.9999999999999996
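The two numbers above are exactly the numerator and denominator of the cosine similarity between x and y, so the last step is just their ratio, which comes out to roughly one third. A short check with the same x, y, and imports as above:

# cosine similarity = dot(x, y) / (norm(x) * norm(y)) = 1.0 / 3.0
cos_xy = np.dot(x, y) / (norm(x) * norm(y))
print(cos_xy)   # ~0.333 (one third, up to floating-point rounding)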