M.
Sri Phani Bhushan
AP19110010110
In [34]:
import nltk
Splitting the text file into 1400 different text files
In [58]:
i=1
punctuations='''()-[]{};:\/,.<>@$''^+1234567890*%&=?'''
with open('cran/cran.all.1400','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Dataset assignment-1\{line[3:len(line)-1]}.txt','x')
elif line[:2] in ['.T','.B','.A','.W']:
continue
elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char
file.write(no_punct_line)
In [59]:
file=open(f'Dataset assignment-1\1400.txt','w')
file.close()
Tokenizing and stemming the documents
In [114]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
tokens = {}
for i in range(1, 1401):
file = open(f'Dataset assignment-1\{i}.txt')
for line in file:
for word in line.split():
w = str(ps.stem(word))
if w in tokens:
tokens[w][0] = len(tokens[w][1])
tokens[w][1].add(i)
else:
tokens[w] = [1,set()]
tokens[w][1].add(i)
# tokens[w] = 1
file.close()
In [115]:
print(tokens.keys())
dict_keys(['experiment', 'investig', 'of', 'the', 'aerodynam', 'a', 'win
g', 'in', 'slipstream', 'brenckmanm', 'j', 'ae', 'sc', 'an', 'studi', 'pro
pel', 'wa', 'made', 'order', 'to', 'determin', 'spanwis', 'distribut', 'li
ft', 'increas', 'due', 'at', 'differ', 'angl', 'attack', 'and', 'free', 's
tream', 'veloc', 'ratio', 'result', 'were', 'intend', 'part', 'as', 'eval
u', 'basi', 'for', 'theoret', 'treatment', 'thi', 'problem', 'compar', 'sp
an', 'load', 'curv', 'togeth', 'with', 'support', 'evid', 'show', 'that',
'substanti', 'increment', 'produc', 'by', 'destal', 'or', 'boundarylayerco
ntrol', 'effect', 'integr', 'remain', 'after', 'subtract', 'found', 'agr
e', 'well', 'potenti', 'flow', 'theori', 'empir', 'specif', 'configur', 'e
xperi', 'simpl', 'shear', 'past', 'flat', 'plate', 'incompress', 'fluid',
'small', 'viscos', 'tingyili', 'depart', 'aeronaut', 'engin', 'renssela',
'polytechn', 'institut', 'troy', 'ny', 'highspe', 'viscou', 'twodimensio
n', 'bodi', 'it', 'is', 'usual', 'necessari', 'consid', 'shock', 'wave',
'emit', 'from', 'nose', 'lead', 'edg', 'consequ', 'there', 'exist', 'invis
cid', 'rotat', 'region', 'between', 'boundari', 'layer', 'such', 'situat',
'aris', 'instanc', 'hyperson', 'somewhat', 'prandtl', 'classic', 'boundary
lay', 'origin', 'outsid', 'irrot', 'while', 'must', 'be', 'possibl', 'vort
ic', 'have', 'been', 'recent', 'discuss', 'ferri', 'libbi', 'present', 'pa
In [116]:
len(tokens)
Out[116]:
7480
In [488]:
#Sorting the tokens
sorted_tokens=dict(sorted(tokens.items(),key=lambda x:x[1][0],reverse=True))
print(sorted_tokens)
{'of': [1395, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 3
6, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 7
3, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 10
8, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 1
23, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,
168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212,
213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257,
258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302,
In [118]:
print(sorted_tokens.keys())
'environment', 'page', 'philosophi', 'cut', 'socal', 'paramount', 'countr
i', 'drastic', 'lester', 'classif', 'log', 'jmathphi', 'hypergeometr', 'ow
enpr', 'insert', 'grid', 'grade', 'typifi', 'neighbourhood', 'signal', 'fi
lter', 'db', 'lubric', 'mathieu', 'stratiform', 'apprais', 'elabor', 'wort
h', 'quadrupol', 'shapiro', 'telemet', 'track', 'interplanetari', 'refract
ori', 'aeroquart', 'ward', 'biconvex', 'xy', 'gaussian', 'immers', 'millik
ancb', 'belong', 'lowturbul', 'unfortun', 'ship', 'hodograph', 'zeroord',
'miss', 'mar', 'overshoot', 'brake', 'walker', 'dispers', 'inert', 'cell',
'lowdens', 'photographi', 'annulu', 'holderdw', 'cravenah', 'entrain', 'de
duct', 'utia', 'maxwellian', 'molyneuxwg', 'tnstruct', 'compressibleflow',
'monatom', 'liter', 'unusu', 'yashuram', 'jphyssoc', 'lengthwis', 'aj', 'p
ercentthick', 'idealga', 'rankin', 'seal', 'lowfrequ', 'omit', 'british',
'constrain', 'screen', 'stand', 'sideforc', 'manufactur', 'stratfordb', 'c
oncurr', 'bubbl', 'singlestag', 'stator', 'powel', 'astronaut', 'ensur',
'smoke', 'quasicylindr', 'quasi', 'bank', 'angularli', 'weapon', 'tangen
c', 'tangentcon', 'quasicylind', 'referenc', 'code', 'kettledj', 'strut',
'gooderumpb', 'woodgp', 'visualis', 'visibl', 'royal', 'deposit', 'oil',
'compact', 'white', 'hard', 'eventu', 'said', 'weberj', 'squir', 'sweptw',
'lilleygm', 'civil', 'poiseuil', 'nuclear', 'jappphi', 'eckhausw', 'con',
'hour', 'jd', 'polish', 'stagnationtowal', 'mission', 'evapor', 'entail',
In [119]:
# creating the dataframe
import pandas as pd
words = list(sorted_tokens.keys())
temp = list(sorted_tokens.values())
count, freq = [], []
for i in range(len(temp)):
count.append(len(temp[i][1]))
freq.append(list(temp[i][1]))
In [120]:
dict_tokens = {'Tokens': words, 'DOC Frequency':count, 'Document ID': freq}
df = pd.DataFrame.from_dict(dict_tokens)
df.head()
Out[120]:
Tokens DOC Frequency Document ID
0 of 1395 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
1 the 1391 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
3 a 1307 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
4 to 1252 [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...
In [121]:
#stopwords removal
dict_tokens = {'Tokens': words, 'DOC Frequency':count, 'DocumentID': freq}
df = pd.DataFrame.from_dict(dict_tokens).set_index("Tokens")
stopwords = df.head(30)
df.drop(index=df.index[:30], axis=0, inplace=True)
stopwords.to_csv('stopwords.csv')
In [122]:
# printing tokens
df=df.sort_values('Tokens')
df.to_csv('index.csv')
In [123]:
# printing tokens
index_df=pd.read_csv("index.csv")
index_df.head()
Out[123]:
Tokens DOC Frequency DocumentID
0 aaaero 1 [1111]
1 aaaeroconf 1 [899]
2 aasu 1 [722]
3 ab 3 [744, 924, 1381]
4 abbott 1 [1340]
In [124]:
# Stopwords csv
stopwords_df=pd.read_csv("stopwords.csv")
stopwords_df.head(15)
Out[124]:
Tokens DOC Frequency DocumentID
0 of 1395 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
1 the 1391 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
2 and 1342 [1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ...
3 a 1307 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...
4 to 1252 [1, 2, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 1...
5 in 1241 [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15...
6 is 1151 [2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,...
7 for 1145 [1, 2, 3, 4, 5, 6, 9, 10, 13, 14, 15, 16, 17, ...
8 are 1029 [3, 4, 5, 6, 7, 11, 12, 14, 15, 17, 18, 19, 20...
9 with 1010 [1, 3, 4, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17,...
10 on 913 [7, 8, 9, 11, 13, 14, 15, 17, 18, 19, 21, 22, ...
11 by 854 [1, 2, 4, 6, 7, 9, 13, 14, 15, 16, 17, 20, 21,...
12 that 805 [1, 2, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18,...
13 an 796 [1, 2, 8, 9, 10, 11, 14, 15, 16, 17, 19, 21, 2...
14 at 771 [1, 5, 6, 7, 8, 9, 10, 11, 13, 14, 18, 19, 24,...
Query processing
In [125]:
i=1
punctuations='''()-[]{};:\/,.<>@$^*%&'''
with open('cran/cran.qry','r') as f:
for line in f:
if line[:2]=='.I':
file=open(f'Cran Query\{i}.txt','x')
i+=1
elif line[:2] in ['.T','.B','.A','.W']:
continue
elif line=='':
file.close()
else:
no_punct_line=""
for char in line:
if(char not in punctuations):
no_punct_line+=char
file.write(no_punct_line)
...
In [126]:
file=open(f'Cran Query\225.txt','w')
file.close()
In [505]:
stopwords_list
Out[505]:
['of',
'the',
'and',
'a',
'to',
'in',
'is',
'for',
'are',
'with',
'on',
'by',
'that',
'an',
'at',
'be',
'flow',
'result',
'thi',
'as',
'from',
'it',
'which',
'number',
'effect',
'pressur',
'use',
'present',
'j',
'obtain']
Query execution-AND
In [506]:
output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.intersection(mat[i])
if len(k)!=0:
output[i]=list(k)
output
Out[506]:
{9: [329, 142, 1263, 625, 1107, 1300, 1204, 983, 666, 1307, 1213],
3: [1040, 185, 1250, 486],
4: [315, 1323, 131],
1: [2,
3,
4,
1029,
8,
9,
522,
13,
525,
15,
527,
1042,
21,
22,
In [531]:
# QUERY-OR
output = dict()
for i in range(1,226):
with open(f'Cran Query\{i}.txt', 'r') as f:
mat = []
for line in f:
for word in line.split():
if word in sorted_tokens:
mat.append(sorted_tokens[word][1])
k = mat[0]
for i in range(1, len(mat)):
k = k.union(mat[i])
if len(k)!=0:
output[i]=list(k)
output
Out[531]:
{7: [1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15,
16,
17,
Query-1
In [507]:
# Creating a dataframe for the documents of the query-9
query_df=pd.DataFrame(output[1],columns=["docid"])
query_df["qid"]=1
query_df
Out[507]:
docid qid
0 2 1
1 3 1
2 4 1
3 1029 1
4 8 1
... ... ...
211 493 1
212 1008 1
213 500 1
214 1013 1
215 1014 1
216 rows × 2 columns
In [508]:
# creating a dataframe for the query and relevance scores
import pandas as pd
re_pd=pd.read_csv('cran/cranqrel.csv',names=["qid","docid","rel"])
re_pd
Out[508]:
qid docid rel
0 1 184 2
1 1 29 2
2 1 31 2
3 1 12 3
4 1 51 3
... ... ... ...
1832 225 1062 3
1833 225 1074 3
1834 225 1075 3
1835 225 1213 3
1836 225 1188 -1
1837 rows × 3 columns
In [509]:
# merging the dataframe for the relevant retives docs
final_df=pd.merge(re_pd, query_df, on=["qid","docid"], how='inner')
final_df=final_df[final_df.rel<3]
final_df
Out[509]:
qid docid rel
0 1 29 2
1 1 31 2
5 1 486 -1
In [510]:
# defining values for calculating precision recall
relevant_retrived=len(final_df)
total_relevant=len(re_pd[re_pd.qid == 1])
total_retrived=len(query_df)
In [511]:
print(relevant_retrived)
print(total_relevant)
print(total_retrived)
29
216
In [512]:
def precision(a,b):
return a/b
def recall(c,d):
return c/d
In [513]:
precision_9=precision(relevant_retrived,total_retrived)
recall_9=recall(relevant_retrived,total_relevant)
measures_dict={}
measures_dict["Query1"]=[precision_9,recall_9]
In [514]:
measures_dict
Out[514]:
{'Query1': [0.013888888888888888, 0.10344827586206896]}
QID-3
In [515]:
def score(a):
query_df=pd.DataFrame(output[a],columns=["docid"])
query_df["qid"]=a
final_df=pd.merge(re_pd, query_df, on=["qid","docid"], how='inner')
final_df=final_df[final_df.rel<3]
relevant_retrived=len(final_df)
print(final_df)
total_relevant=len(re_pd[re_pd.qid == a])
total_retrived=len(query_df)
precision_3=precision(relevant_retrived,total_retrived)
recall_3=recall(relevant_retrived,total_relevant)
measures_dict["Query"+str(a)]=[precision_3,recall_3]
print("Total relevant documents are {}".format(total_relevant))
print("Total relevant retrived documents are {}".format(relevant_retrived))
print("Total retrived documents are {}".format(total_retrived))
return measures_dict
In [516]:
score(3)
Empty DataFrame
Columns: [qid, docid, rel]
Index: []
Total relevant documents are 9
Total relevant retrived documents are 0
Total retrived documents are 4
Out[516]:
{'Query1': [0.013888888888888888, 0.10344827586206896], 'Query3': [0.0, 0.
0]}
In [517]:
measures_dict
Out[517]:
{'Query1': [0.013888888888888888, 0.10344827586206896], 'Query3': [0.0, 0.
0]}
QID-15
In [518]:
score(15)
qid docid rel
0 15 463 1
2 15 497 -1
Total relevant documents are 3
Total relevant retrived documents are 2
Total retrived documents are 1395
Out[518]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666]}
In [519]:
measures_dict
Out[519]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666]}
QID-71
In [520]:
score(71)
qid docid rel
0 71 569 1
1 71 571 1
2 71 1355 2
6 71 572 1
Total relevant documents are 9
Total relevant retrived documents are 4
Total retrived documents are 913
Out[520]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444]}
In [521]:
measures_dict
Out[521]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444]}
QID-2
In [522]:
score(2)
Empty DataFrame
Columns: [qid, docid, rel]
Index: []
Total relevant documents are 25
Total relevant retrived documents are 0
Total retrived documents are 99
Out[522]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0]}
In [523]:
measures_dict
Out[523]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0]}
QID-109
In [524]:
score(109)
qid docid rel
0 109 860 1
1 109 861 1
5 109 766 -1
Total relevant documents are 6
Total relevant retrived documents are 3
Total retrived documents are 1252
Out[524]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0],
'Query109': [0.0023961661341853034, 0.5]}
In [525]:
measures_dict
Out[525]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0],
'Query109': [0.0023961661341853034, 0.5]}
QID-6
In [526]:
score(6)
Empty DataFrame
Columns: [qid, docid, rel]
Index: []
Total relevant documents are 5
Total relevant retrived documents are 0
Total retrived documents are 10
Out[526]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0],
'Query109': [0.0023961661341853034, 0.5],
'Query6': [0.0, 0.0]}
QID-192
In [527]:
score(192)
qid docid rel
0 192 733 1
1 192 734 1
2 192 735 1
3 192 736 1
4 192 641 -1
Total relevant documents are 5
Total relevant retrived documents are 5
Total retrived documents are 1010
Out[527]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0],
'Query109': [0.0023961661341853034, 0.5],
'Query6': [0.0, 0.0],
'Query192': [0.0049504950495049506, 1.0]}
QID-204
In [528]:
score(204)
Empty DataFrame
Columns: [qid, docid, rel]
Index: []
Total relevant documents are 15
Total relevant retrived documents are 0
Total retrived documents are 27
Out[528]:
{'Query1': [0.013888888888888888, 0.10344827586206896],
'Query3': [0.0, 0.0],
'Query15': [0.0014336917562724014, 0.6666666666666666],
'Query71': [0.004381161007667032, 0.4444444444444444],
'Query2': [0.0, 0.0],
'Query109': [0.0023961661341853034, 0.5],
'Query6': [0.0, 0.0],
'Query192': [0.0049504950495049506, 1.0],
'Query204': [0.0, 0.0]}
In [529]:
# creating a dataframe with precision and recall values
values_df=pd.DataFrame.from_dict(measures_dict, orient='index',columns=["Precision","Recall
In [530]:
values_df
Out[530]:
Precision Recall
Query1 0.013889 0.103448
Query3 0.000000 0.000000
Query15 0.001434 0.666667
Query71 0.004381 0.444444
Query2 0.000000 0.000000
Query109 0.002396 0.500000
Query6 0.000000 0.000000
Query192 0.004950 1.000000
Query204 0.000000 0.000000
In [ ]:
In [ ]: