
SAHIL MALHOTRA

16 BCE 0113
WEB MINING L51+L52
1. UNIVERSAL CRAWLING

1.1. CODE

from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has,
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL. We are also adding the
                    # base URL to it. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL).
                    # We combine the two to create an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links.append(newUrl)

    # This is a new function that we are creating to get links;
    # our spider() function will call it
    def getLinks(self, url, numberVisited):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        htmlBytes = response.read()

        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or PDFs, for example)
        if 'text/html' in response.getheader('Content-Type'):
            # Save the raw page as a numbered text file
            file_name = str(numberVisited) + '.txt'
            file = open(file_name, 'wb')
            file.write(htmlBytes)
            file.close()

            # Record which URL produced this page, one URL per line
            f = open("FileCheckDatabase", 'a')
            f.write(url + '\n')
            f.close()
            print("File writing successful for " + str(numberVisited) + "!")

            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            self.feed(htmlString)
            return self.links
        else:
            return []

# And finally here is our spider. It takes in a starting URL and the
# maximum number of pages to visit before giving up
def spider(url, maxPages):
    pagesToVisit = [url]
    print(pagesToVisit)
    numberVisited = 0

    # The main loop. Create a LinkParser and get all the links on the page.
    # getLinks() saves the page to disk and returns the set of links it found,
    # which tells us where to go next.
    while numberVisited < maxPages and pagesToVisit != []:
        numberVisited = numberVisited + 1

        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]

        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            links = parser.getLinks(url, numberVisited)

            # Add the links that we found to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print('List length count: ' + str(len(pagesToVisit)))
        except Exception as e:
            print(str(e))

spider('https://en.wikipedia.org/wiki/Google', 50)
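
The crawler numbers each saved page and appends the matching URL to FileCheckDatabase, so page N.txt corresponds to line N of that file. A minimal lookup sketch under that assumption (the url_for_page helper is illustrative and not part of the lab code; it assumes the crawler above has already run in the current directory):

# Hypothetical helper: page N was saved as "N.txt" and its URL
# was appended to FileCheckDatabase as line N by the crawler above.
def url_for_page(page_number):
    with open("FileCheckDatabase") as f:
        urls = f.read().split()
    return urls[page_number - 1]

print(url_for_page(1))  # e.g. the seed URL, https://en.wikipedia.org/wiki/Google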
1.2. FILES CREATED
1.3. RUNNING SAMPLE FILE
2. COSINE SIMILARITY

2.1. CODE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import glob

# The query we want to match against the crawled pages
query = "WikiPedia is a very good site for learning, and is very helpful for children"

file_names = []
d = {}

# The first entry of the training set is the query itself;
# the crawled pages saved as *.txt files follow it
train_set = [query]
flist = glob.glob('*.txt')

for fname in flist:
    file_names.append(fname)
    f = open(fname, encoding="UTF-8")
    train_set.append(f.read())
    f.close()

# Extract TF-IDF features for the query and every crawled document
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

# Cosine similarity between the query (row 0) and each document
cosine_similar_value = []
for i in range(1, len(train_set)):
    c = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[i])
    cosine_similar_value.append(float(c[0][0]))

# Remember which score belongs to which file, then sort the scores
for j in range(0, len(file_names)):
    d[j] = cosine_similar_value[j]

cosine_similar_value.sort(reverse=True)

# FileCheckDatabase maps page numbers back to the URLs the crawler visited
file = open('FileCheckDatabase', 'r')
l = file.read().split()
file.close()

# Print the URLs of the ten pages most similar to the query
for i in range(0, 10):
    x = list(d.keys())[list(d.values()).index(cosine_similar_value[i])]
    del d[x]
    h = int(file_names[x][:-4])   # "7.txt" -> 7
    print(l[h - 1])
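
As an aside, the loop-and-sort above can be collapsed into a single vectorized call. A compact alternative sketch, assuming the same train_set, file_names and FileCheckDatabase as above (this variant is not part of the original listing):

import numpy as np

# Score every crawled page against the query (row 0) in one call
scores = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[1:]).flatten()
urls = open('FileCheckDatabase').read().split()

# argsort is ascending, so reverse it and keep the ten best matches
for idx in np.argsort(scores)[::-1][:10]:
    page_number = int(file_names[idx][:-4])   # "7.txt" -> 7
    print(urls[page_number - 1], scores[idx])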

3. TASK 3

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]   # Documents
test_set = ["The sun in the sky is bright."]             # Query

stopWords = stopwords.words('english')
vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

# Raw term-count vectors for the documents and the query
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()

print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

# TF-IDF weights for the documents
transformer.fit(trainVectorizerArray)
print(transformer.transform(trainVectorizerArray).toarray())

# TF-IDF weights for the query
transformer.fit(testVectorizerArray)
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
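
The numpy and numpy.linalg imports above suggest the exercise builds toward computing the cosine similarity between these vectors by hand. A minimal sketch under that assumption (the cx helper is illustrative, not part of the original listing):

# Cosine similarity between two vectors: dot product divided by the
# product of their Euclidean norms
cx = lambda a, b: float(np.inner(a, b) / (LA.norm(a) * LA.norm(b)))

queryVector = testVectorizerArray[0]
for vector in trainVectorizerArray:
    print('Cosine similarity with the query:', cx(vector, queryVector))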

4. TASK 4
import numpy as np
y = [1.0, 1.0, 1.0, 0.0, 0.0]
x = [0.0, 1.0, 0.0, 1.0, 1.0]

np.dot(x,y)

OUTPUT:
1.0

x_dot_y = sum([(1.0 * 0.0) + (1.0 * 1.0) + (1.0 * 0.0) + (0.0 * 1.0) + (0.0 * 1.0)])

x_dot_y

OUTPUT:
1.0

from numpy.linalg import norm

y = [1.0, 1.0, 1.0, 0.0, 0.0]

x = [0.0, 1.0, 0.0, 1.0, 1.0]

norm(x) * norm(y)

OUTPUT:

2.9999999999999996

import math

# with np.dot

math.sqrt(np.dot(x,x)) * math.sqrt(np.dot(y,y))

OUTPUT:
2.9999999999999996
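
Combining the two quantities gives the cosine similarity itself; a short worked continuation using the x, y, np and norm defined above:

# cosine similarity = (x . y) / (||x|| * ||y||) = 1.0 / 3.0, i.e. about 0.333
cos_xy = np.dot(x, y) / (norm(x) * norm(y))
cos_xy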
