NLP-kr
diff --git a/‎3.NLP_INTRO/3.3_텍스트_유사도.ipynb
Lines changed: 139 additions & 0 deletions b/‎3.NLP_INTRO/3.3_텍스트_유사도.ipynb
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 문장 벡터화 진행"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "\n",
+    "# Fit TF-IDF on the two example sentences and print each term's IDF weight.\n",
+    "sent = (\"휴일 인 오늘 도 서쪽 을 중심 으로 폭염 이 이어졌는데요, 내일 은 반가운 비 소식 이 있습니다.\", \"폭염 을 피해서 휴일 에 놀러왔다가 갑작스런 비 로 인해 망연자실 하고 있습니 다.\")\n",
+    "tfidf_vectorizer = TfidfVectorizer()\n",
+    "tfidf_matrix = tfidf_vectorizer.fit_transform(sent)  # vectorize both sentences\n",
+    "\n",
+    "idf = tfidf_vectorizer.idf_\n",
+    "# get_feature_names() was removed in scikit-learn 1.2; use its replacement\n",
+    "# when the installed version provides it and fall back otherwise.\n",
+    "if hasattr(tfidf_vectorizer, \"get_feature_names_out\"):\n",
+    "    feature_names = tfidf_vectorizer.get_feature_names_out()\n",
+    "else:\n",
+    "    feature_names = tfidf_vectorizer.get_feature_names()\n",
+    "print(dict(zip(feature_names, idf)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 유사도의 예\n",
+    " 1. 자카드 유사도\n",
+    " 2. 코사인 유사도\n",
+    " 3. 유클리디안 유사도\n",
+    " 4. 맨하탄 유사도"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. 자카드 유사도\n",
+    "# jaccard_similarity_score was removed from scikit-learn (0.23) and, for\n",
+    "# non-multilabel input like this, it returned plain accuracy rather than a\n",
+    "# Jaccard index. jaccard_score (scikit-learn >= 0.21) is its replacement.\n",
+    "from sklearn.metrics import jaccard_score\n",
+    "\n",
+    "# The labels are multiclass (0, 1, 2), so an averaging mode must be given;\n",
+    "# 'micro' pools true/false positives over all classes: 3 / (3 + 1 + 1) = 0.6.\n",
+    "jaccard_score(np.array([1,1,0,0]), np.array([1,1,0,2]), average=\"micro\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. 코사인 유사도\n",
+    "\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "\n",
+    "# Cosine similarity between the TF-IDF rows of sentence 0 and sentence 1.\n",
+    "first_sentence_vec = tfidf_matrix[0:1]\n",
+    "second_sentence_vec = tfidf_matrix[1:2]\n",
+    "cosine_similarity(first_sentence_vec, second_sentence_vec)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. 유클리디안 유사도\n",
+    "\n",
+    "from sklearn.metrics.pairwise import euclidean_distances\n",
+    "\n",
+    "# Show the distance between the two TF-IDF rows; a bare expression in the middle\n",
+    "# of a cell is evaluated but never displayed, so it has to be printed explicitly.\n",
+    "print(euclidean_distances(tfidf_matrix[0:1], tfidf_matrix[1:2]))\n",
+    "\n",
+    "# 정규화\n",
+    "\n",
+    "import numpy as np\n",
+    "\n",
+    "def l1_normalize(v):\n",
+    "    \"\"\"Rescale every row vector of ``v`` so its absolute values sum to 1.\n",
+    "\n",
+    "    Args:\n",
+    "        v: 1-D or 2-D array-like, or a SciPy sparse matrix such as the\n",
+    "            output of ``TfidfVectorizer.fit_transform``.\n",
+    "\n",
+    "    Returns:\n",
+    "        A float ndarray for dense input, or a CSR sparse matrix for sparse\n",
+    "        input, with the same shape as ``v``. All-zero rows stay all-zero.\n",
+    "    \"\"\"\n",
+    "    if hasattr(v, \"tocsr\"):  # SciPy sparse input\n",
+    "        row_sums = np.asarray(abs(v).sum(axis=1), dtype=float).reshape(-1, 1)\n",
+    "        # Divide all-zero rows by 1 instead of 0 so they pass through unchanged.\n",
+    "        row_sums = np.where(row_sums == 0, 1.0, row_sums)\n",
+    "        # multiply() with a dense column may return COO; CSR keeps row slicing working.\n",
+    "        return v.multiply(1.0 / row_sums).tocsr()\n",
+    "    dense = np.asarray(v, dtype=float)\n",
+    "    # Normalise along the last axis: each row of a matrix, or a whole 1-D vector.\n",
+    "    row_sums = np.abs(dense).sum(axis=-1, keepdims=True)\n",
+    "    row_sums = np.where(row_sums == 0, 1.0, row_sums)\n",
+    "    return dense / row_sums\n",
+    "\n",
+    "# Rescale the TF-IDF matrix with l1_normalize, then measure the Euclidean\n",
+    "# distance between the rows of sentence 0 and sentence 1.\n",
+    "tfidf_norm_l1 = l1_normalize(tfidf_matrix)\n",
+    "first_row_l1 = tfidf_norm_l1[0:1]\n",
+    "second_row_l1 = tfidf_norm_l1[1:2]\n",
+    "euclidean_distances(first_row_l1, second_row_l1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. 맨하탄 유사도\n",
+    "\n",
+    "from sklearn.metrics.pairwise import manhattan_distances\n",
+    "\n",
+    "# Manhattan distance between the two rescaled sentence vectors.\n",
+    "row_a, row_b = tfidf_norm_l1[0:1], tfidf_norm_l1[1:2]\n",
+    "manhattan_distances(row_a, row_b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}