SAHIL MALHOTRA
16 BCE 0113
WEB MINING L51+L52
1. UNIVERSAL CRAWLING
1.1. CODE
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse

# We are going to create a class called LinkParser that inherits some
# methods from HTMLParser, which is why it is passed into the definition
class LinkParser(HTMLParser):

    # This is a function that HTMLParser normally has,
    # but we are adding some functionality to it
    def handle_starttag(self, tag, attrs):
        # We are looking for the beginning of a link. Links normally look
        # like <a href="www.someurl.com"></a>
        if tag == 'a':
            for (key, value) in attrs:
                if key == 'href':
                    # We are grabbing the new URL and joining it with the
                    # base URL. For example:
                    # www.netinstructions.com is the base and
                    # somepage.html is the new URL (a relative URL)
                    #
                    # We combine a relative URL with the base URL to create
                    # an absolute URL like:
                    # www.netinstructions.com/somepage.html
                    newUrl = parse.urljoin(self.baseUrl, value)
                    # And add it to our collection of links:
                    self.links.append(newUrl)

    # This is a new function that we are creating to get links;
    # our spider() function will call it
    def getLinks(self, url, numberVisited):
        self.links = []
        # Remember the base URL, which will be important when creating
        # absolute URLs
        self.baseUrl = url
        # Use the urlopen function from the standard Python 3 library
        response = urlopen(url)
        htmlBytes = response.read()
        # Make sure that we are looking at HTML and not other things that
        # are floating around on the internet (such as
        # JavaScript files, CSS, or PDFs, for example)
        if 'text/html' in response.getheader('Content-Type'):
            # Note that feed() handles strings well, but not bytes
            # (a change from Python 2.x to Python 3.x)
            htmlString = htmlBytes.decode("utf-8")
            # Save the raw page as a numbered .txt file so it can be
            # indexed later by the cosine similarity ranker
            file_name = str(numberVisited) + '.txt'
            file = open(file_name, 'wb')
            file.write(htmlBytes)
            file.close()
            # Record which URL went into which file, one URL per line
            f = open("FileCheckDatabase", 'a')
            f.write(url + '\n')
            f.close()
            print("File writing successful for " + str(numberVisited) + "!")
            self.feed(htmlString)
            return self.links
        else:
            return []

# And finally here is our spider. It takes in a starting URL and the
# number of pages to visit before giving up
def spider(url, maxPages):
    pagesToVisit = [url]
    print(pagesToVisit)
    numberVisited = 0
    # The main loop. Create a LinkParser and get all the links on the page.
    # getLinks() saves the page to disk and returns the set of links from
    # that web page (which is useful for deciding where to go next).
    while numberVisited < maxPages and pagesToVisit != []:
        numberVisited = numberVisited + 1
        # Start from the beginning of our collection of pages to visit:
        url = pagesToVisit[0]
        pagesToVisit = pagesToVisit[1:]
        try:
            print(numberVisited, "Visiting:", url)
            parser = LinkParser()
            links = parser.getLinks(url, numberVisited)
            # Add the links that we found to the end of our collection
            # of pages to visit:
            pagesToVisit = pagesToVisit + links
            print('List length count: ' + str(len(pagesToVisit)))
        except Exception as e:
            print(str(e))

spider('https://en.wikipedia.org/wiki/Google', 50)
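The crawler depends on parse.urljoin to turn relative href values into absolute URLs before queueing them. The following is a minimal sketch of that behaviour (the href values are made up purely for illustration):

from urllib import parse

base = 'https://en.wikipedia.org/wiki/Google'
# A relative href is resolved against the base page
print(parse.urljoin(base, '/wiki/Web_crawler'))               # https://en.wikipedia.org/wiki/Web_crawler
# An already-absolute href is returned unchanged
print(parse.urljoin(base, 'https://example.com/page.html'))   # https://example.com/page.html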
1.2. FILES CREATED
1.3. RUNNING SAMPLE FILE
2. COSINE SIMILARITY
2.1. CODE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import glob

# The query to rank the crawled pages against
# query = input("Enter Query:")
query = "WikiPedia is a very good site for learning, and is very helpful for children"

file_names = []
d = {}
train_set = []
train_set.append(query)

# Read every crawled page (the crawler saved them as <number>.txt)
flist = glob.glob('*.txt')
for fname in flist:
    file_names.append(fname)
    f = open(fname, encoding="UTF-8")
    train_set.append(f.read())
    f.close()

# Build one TF-IDF matrix over the query (row 0) plus all documents
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

# Cosine similarity between the query and each document
cosine_similar_value = []
for i in range(1, len(train_set)):
    c = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train[i])
    cosine_similar_value.append(float(c[0][0]))

# Remember which score belongs to which document, then sort the
# scores in descending order
for j in range(0, len(file_names)):
    d[j] = cosine_similar_value[j]
cosine_similar_value.sort(reverse=True)

# FileCheckDatabase lists the crawled URLs in order, so the file
# "<h>.txt" corresponds to the h-th URL in that list
file = open('FileCheckDatabase', 'r')
data = file.read()
l = data.split()

# Print the URLs of the ten most similar pages, best match first
for i in range(0, 10):
    x = list(d.keys())[list(d.values()).index(cosine_similar_value[i])]
    del d[x]
    h = file_names[x]
    h = int(h[:-4])
    print(l[h - 1])
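Since cosine_similarity also accepts a whole matrix as its second argument, the per-document loop and the dictionary lookup above can be collapsed into a single call plus an argsort. This is only an alternative sketch, reusing the tfidf_matrix_train, file_names, and l variables built above:

import numpy as np

# Similarity of the query (row 0) against every row; drop the query itself
scores = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train)[0][1:]

# Indices of the ten highest-scoring documents, best match first
for x in np.argsort(scores)[::-1][:10]:
    h = int(file_names[x][:-4])    # "17.txt" -> 17
    print(l[h - 1], scores[x])     # crawled URL and its cosine score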
3. TASK 3: TF-IDF VECTORS WITH SCIKIT-LEARN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA

train_set = ["The sky is blue.", "The sun is bright."]  # Documents
test_set = ["The sun in the sky is bright."]            # Query

stopWords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopWords)
transformer = TfidfTransformer()

# Term-count vectors for the documents and for the query
trainVectorizerArray = vectorizer.fit_transform(train_set).toarray()
testVectorizerArray = vectorizer.transform(test_set).toarray()
print('Fit Vectorizer to train set', trainVectorizerArray)
print('Transform Vectorizer to test set', testVectorizerArray)

# TF-IDF weights for the documents
transformer.fit(trainVectorizerArray)
print()
print(transformer.transform(trainVectorizerArray).toarray())

# TF-IDF weights for the query
transformer.fit(testVectorizerArray)
print()
tfidf = transformer.transform(testVectorizerArray)
print(tfidf.todense())
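The np and LA imports above can be used to finish the comparison: the cosine of the angle between the query's TF-IDF vector and each document's is their inner product divided by the product of their norms. A minimal sketch, assuming the tfidf and trainVectorizerArray variables produced above:

# Cosine similarity between the query and each document, computed by hand
cosine = lambda a, b: float(np.inner(a, b) / (LA.norm(a) * LA.norm(b)))

doc_tfidf = TfidfTransformer().fit_transform(trainVectorizerArray).toarray()
query_tfidf = np.asarray(tfidf.todense())[0]

for i, doc_vec in enumerate(doc_tfidf):
    print('cosine(query, document %d) = %.3f' % (i, cosine(query_tfidf, doc_vec)))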
4. TASK 4: DOT PRODUCT AND VECTOR NORMS WITH NUMPY
import numpy as np
y = [1.0, 1.0, 1.0, 0.0, 0.0]
x = [0.0, 1.0, 0.0, 1.0, 1.0]
np.dot(x,y)
OUTPUT:
1.0
x_dot_y = sum([(1.0 * 0.0) + (1.0 * 1.0) + (1.0 * 0.0) +
(0.0 * 1.0) + (0.0 * 1.0)])
x_dot_y
OUTPUT:
1.0
from numpy.linalg import norm
y = [1.0, 1.0, 1.0, 0.0, 0.0]
x = [0.0, 1.0, 0.0, 1.0, 1.0]
norm(x) * norm(y)
OUTPUT:
2.9999999999999996
import math
# with np.dot
math.sqrt(np.dot(x,x)) * math.sqrt(np.dot(y,y))
OUTPUT:
2.9999999999999996
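The two numbers above are exactly the numerator and denominator of the cosine similarity between x and y, so the last step is just their ratio, which comes out to roughly one third. A short check with the same x, y, and imports as above:

# cosine similarity = dot(x, y) / (norm(x) * norm(y)) = 1.0 / 3.0
cos_xy = np.dot(x, y) / (norm(x) * norm(y))
print(cos_xy)   # ~0.333 (one third, up to floating-point rounding)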