8000 resolved file issue · NLP-kr/tensorflow-ml-nlp@d5a6265 · GitHub
[go: up one dir, main page]

Skip to content

Commit d5a6265

Browse files
committed
resolved file issue
1 parent 1f3e103 commit d5a6265

File tree

1 file changed

+139
-0
lines changed

1 file changed

+139
-0
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# 문장 벡터화 진행"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": null,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"import numpy as np"
17+
]
18+
},
19+
{
20+
"cell_type": "code",
21+
"execution_count": null,
22+
"metadata": {},
23+
"outputs": [],
24+
"source": [
25+
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
26+
"\n",
27+
"sent = (\"휴일 인 오늘 도 서쪽 을 중심 으로 폭염 이 이어졌는데요, 내일 은 반가운 비 소식 이 있습니다.\", \"폭염 을 피해서 휴일 에 놀러왔다가 갑작스런 비 로 인해 망연자실 하고 있습니 다.\") \n",
28+
"tfidf_vectorizer = TfidfVectorizer()\n",
29+
"tfidf_matrix = tfidf_vectorizer.fit_transform(sent) #문장 벡터화 진행\n",
30+
"\n",
31+
"idf = tfidf_vectorizer.idf_\n",
32+
"# get_feature_names() was removed in scikit-learn 1.2; ordering vocabulary_ by column index yields the same names on old and new versions.\n",
"feature_names = sorted(tfidf_vectorizer.vocabulary_, key=tfidf_vectorizer.vocabulary_.get)\n",
"print(dict(zip(feature_names, idf)))"
33+
]
34+
},
35+
{
36+
"cell_type": "markdown",
37+
"metadata": {},
38+
"source": [
39+
"# 유사도의 예\n",
40+
" 1. 자카드 유사도\n",
41+
" 2. 코사인 유사도\n",
42+
" 3. 유클리디안 유사도\n",
43+
" 4. 맨하탄 유사도"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": null,
49+
"metadata": {},
50+
"outputs": [],
51+
"source": [
52+
"# 1. 자카드 유사도\n",
53+
"from sklearn.metrics import jaccard_similarity_score\n",
54+
"\n",
55+
"# jaccard_similarity_score(tfidf_matrix[0:1], tfidf_matrix[1:2])\n",
56+
"jaccard_similarity_score(np.array([1,1,0,0]), np.array([1,1,0,2]))"
57+
]
58+
},
59+
{
60+
"cell_type": "code",
61+
"execution_count": null,
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"# 2. 코사인 유사도\n",
66+
"\n",
67+
"from sklearn.metrics.pairwise import cosine_similarity\n",
68+
"\n",
69+
"# 코사인 유사도를 구해보자\n",
70+
"cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": null,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"# 3. 유클리디안 유사도\n",
80+
"\n",
81+
"from sklearn.metrics.pairwise import euclidean_distances\n",
82+
"\n",
83+
"euclidean_distances(tfidf_matrix[0:1], tfidf_matrix[1:2])\n",
84+
"\n",
85+
"# 정규화\n",
86+
"\n",
87+
"import numpy as np\n",
88+
"\n",
89+
"def l1_normalize(v):\n",
90+
" norm = np.sum(v)\n",
91+
" return v / norm\n",
92+
"\n",
93+
"tfidf_norm_l1 = l1_normalize(tfidf_matrix)\n",
94+
"euclidean_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"metadata": {},
101+
"outputs": [],
102+
"source": [
103+
"# 4. 맨하탄 유사도\n",
104+
"\n",
105+
"from sklearn.metrics.pairwise import manhattan_distances\n",
106+
"\n",
107+
"manhattan_distances(tfidf_norm_l1[0:1], tfidf_norm_l1[1:2])"
108+
]
109+
},
110+
{
111+
"cell_type": "code",
112+
"execution_count": null,
113+
"metadata": {},
114+
"outputs": [],
115+
"source": []
116+
}
117+
],
118+
"metadata": {
119+
"kernelspec": {
120+
"display_name": "Python 3",
121+
"language": "python",
122+
"name": "python3"
123+
},
124+
"language_info": {
125+
"codemirror_mode": {
126+
"name": "ipython",
127+
"version": 3
128+
},
129+
"file_extension": ".py",
130+
"mimetype": "text/x-python",
131+
"name": "python",
132+
"nbconvert_exporter": "python",
133+
"pygments_lexer": "ipython3",
134+
"version": "3.6.8"
135+
}
136+
},
137+
"nbformat": 4,
138+
"nbformat_minor": 2
139+
}

0 commit comments

Comments
 (0)
0