update for batch size and different type lstm · NLP-kr/tensorflow-ml-nlp@1b669da · GitHub


Commit 1b669da

committed
update for batch size and different type lstm
1 parent 15d9df4 commit 1b669da

File tree

2 files changed: +307 -8 lines changed


5.TEXT_SIM/Appendix/5.3.3_Quora_LSTM_Appendix.ipynb

Lines changed: 8 additions & 8 deletions
@@ -43,11 +43,10 @@
 "NB_WORDS_DATA_FILE = 'data_configs.json'\n",
 "\n",
 "## This is where the parameters needed for training are specified.\n",
-"\n",
-"BATCH_SIZE = 16\n",
-"EPOCH = 2\n",
+"## On CPU, it is recommended to reduce the number of epochs.\n",
+"BATCH_SIZE = 4096\n",
+"EPOCH = 50\n",
 "HIDDEN = 64\n",
-"BUFFER_SIZE = 10000\n",
 "\n",
 "NUM_LAYERS = 3\n",
 "DROPOUT_RATIO = 0.3\n",
@@ -88,7 +87,8 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"VOCAB_SIZE = prepro_configs['vocab_size']"
+"VOCAB_SIZE = prepro_configs['vocab_size']\n",
+"BUFFER_SIZE = len(labels)"
 ]
 },
 {
@@ -212,7 +212,7 @@
 "# merged_matrix = tf.concat([base_sementic_matrix, hypothesis_sementic_matrix], -1)\n",
 "# logit_layer = tf.keras.layers.dot([base_sementic_matrix, hypothesis_sementic_matrix], axes=1, normalize=True) \n",
 "# logit_layer = K.exp(-K.sum(K.abs(base_sementic_matrix - hypothesis_sementic_matrix), axis=1, keepdims=True))\n",
-"\n",
+" \n",
 " logit_layer = tf.exp(-tf.reduce_sum(tf.abs(base_sementic_matrix - hypothesis_sementic_matrix), axis=1, keepdims=True))\n",
 " logit_layer = tf.squeeze(logit_layer, axis=-1)\n",
 " \n",
@@ -263,7 +263,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\" #For TEST\n",
+"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"7\" #For GPU\n",
 "\n",
 "model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + \"/checkpoint/rnn2/\")\n",
 "os.makedirs(model_dir, exist_ok=True)\n",
@@ -356,7 +356,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.6.8"
+"version": "3.6.5"
 }
 },
 "nbformat": 4,
Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
"""
A similar BI-LSTM approach.
It uses static_bidirectional_rnn.
"""
# coding: utf-8


import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer

from sklearn.model_selection import train_test_split

# from tensorflow.keras import backend as K

import json

tf.logging.set_verbosity(tf.logging.INFO)

# # Initial global var

# In[ ]:


## Define the global variables in advance: file names, file locations, directories, and so on.

DATA_IN_PATH = '../data_in/'
DATA_OUT_PATH = '../data_out/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'data_configs.json'

# # Load Dataset

# In[ ]:


## This part loads the data. For efficient loading, we load data that was saved in NumPy format beforehand.

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

## This is where the parameters needed for training are specified.

print("# of dataset: {}".format(len(labels)))

BATCH_SIZE = 4096
EPOCH = 20
HIDDEN = 64
BUFFER_SIZE = len(q1_data)

NUM_LAYERS = 3
DROPOUT_RATIO = 0.3

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

# In[ ]:


VOCAB_SIZE = prepro_configs['vocab_size']


# # Split train and test dataset

# In[ ]:


q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

# In[ ]:


## Split the data into train and test sets. sklearn's train_test_split is handy for this. However, the Quora data has
## two inputs rather than one, so we use np.stack to stack the two into a single array before splitting.

X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_Q1 = train_X[:,0]
train_Q2 = train_X[:,1]
test_Q1 = test_X[:,0]
test_Q2 = test_X[:,1]

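# Illustrative sketch (not part of the original file): stacking on axis=1 pairs each
# question with its counterpart, so a single train_test_split call keeps q1, q2 and the
# labels aligned; indexing [:, 0] / [:, 1] afterwards recovers the two question arrays.
_q1_demo = np.zeros((5, MAX_SEQ_LEN))
_q2_demo = np.ones((5, MAX_SEQ_LEN))
_X_demo = np.stack((_q1_demo, _q2_demo), axis=1)
print(_X_demo.shape)         # (5, 2, 31)
print(_X_demo[:, 0].shape)   # (5, 31) -- the q1 rows, still aligned with the q2 rows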
# In[ ]:

def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

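# Illustrative usage sketch (not part of the original file): pull one batch from
# train_input_fn outside the Estimator to inspect the {"base", "hypothesis"} feature
# dict that the model_fn will receive. Assumes TensorFlow 1.x graph mode with sessions;
# the Estimator later calls train_input_fn again inside its own graph, so this peek is separate.
_demo_features, _demo_labels = train_input_fn()
with tf.Session() as _sess:
    _f, _l = _sess.run([_demo_features, _demo_labels])
    print(_f['base'].shape, _f['hypothesis'].shape, _l.shape)  # e.g. (4096, 31) (4096, 31) (4096,)
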
class ManDist(Layer):
    """
    Keras Custom Layer that calculates Manhattan Distance.
    """

    # Initialize the layer; no need to include an inputs parameter.
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatically collect the input shapes to build the layer.
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    # Return the output shape.
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

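# Illustrative sketch (not part of the original file): the Manhattan-distance similarity
# that ManDist computes, exp(-sum|a - b|), equals 1.0 for identical vectors and decays
# toward 0.0 as the L1 distance between the two representations grows.
_a = np.array([[0.2, 0.5]])
_b = np.array([[0.2, 0.5]])
print(np.exp(-np.sum(np.abs(_a - _b), axis=1)))          # [1.]
print(np.exp(-np.sum(np.abs(_a - (_b + 1.0)), axis=1)))  # [0.1353...] == exp(-2)
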
def BiRNN(x, dropout, scope, hidden_units):
    n_hidden = hidden_units
    n_layers = 3
    # Prepare data shape to match `static_rnn` function requirements
    x = tf.unstack(tf.transpose(x, perm=[1, 0, 2]))
    print(x)
    # Define lstm cells with tensorflow
    # Forward direction cell
    with tf.name_scope("fw" + scope), tf.variable_scope("fw" + scope):
        stacked_rnn_fw = []
        for _ in range(n_layers):
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout)
            stacked_rnn_fw.append(lstm_fw_cell)
        lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True)

    # Backward direction cell
    with tf.name_scope("bw" + scope), tf.variable_scope("bw" + scope):
        stacked_rnn_bw = []
        for _ in range(n_layers):
            bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout)
            stacked_rnn_bw.append(lstm_bw_cell)
        lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True)

    # Get lstm cell output
    with tf.name_scope("bw" + scope), tf.variable_scope("bw" + scope):
        outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32)
    return outputs[-1]

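# Illustrative sketch (not part of the original file): static_bidirectional_rnn expects a
# Python list of per-time-step tensors, which is why BiRNN transposes the [batch, time, embed]
# input to time-major and unstacks it. Its last output concatenates the forward and backward
# top-layer states, so BiRNN returns a [batch, 2 * hidden_units] tensor.
_demo_batch = tf.zeros([8, MAX_SEQ_LEN, EMBEDDING_DIM])        # [batch, time, embed]
_demo_steps = tf.unstack(tf.transpose(_demo_batch, perm=[1, 0, 2]))
print(len(_demo_steps), _demo_steps[0].get_shape().as_list())  # 31 [8, 128]
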
# # Model setup

def Malstm(features, labels, mode):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    base_embedded_matrix = embedding(features['base'])
    hypothesis_embedded_matrix = embedding(features['hypothesis'])

    base_sementic_matrix = BiRNN(base_embedded_matrix, DROPOUT_RATIO, 'base', HIDDEN)
    hypothesis_sementic_matrix = BiRNN(hypothesis_embedded_matrix, DROPOUT_RATIO, 'hypothesis', HIDDEN)

    logit_layer = ManDist()([base_sementic_matrix, hypothesis_sementic_matrix])
    logit_layer = tf.squeeze(logit_layer, axis=-1)

    # self._ma_dist([q1_lstm, q2_lstm])

    # logit_layer = tf.exp(-tf.reduce_sum(tf.abs(base_sementic_matrix - hypothesis_sementic_matrix), axis=1, keepdims=True))
    # logit_layer = tf.squeeze(logit_layer, axis=-1)
    #
    if PREDICT:
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  predictions={
                      'is_duplicate': logit_layer
                  })

    # During prediction, labels is None
    if labels is not None:
        labels = tf.to_float(labels)

    # loss = tf.reduce_mean(tf.keras.metrics.binary_crossentropy(y_true=labels, y_pred=logit_layer))
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)
    # loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(labels, logit_layer))

    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  eval_metric_ops=eval_metric_ops,
                  loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
                  mode=mode,
                  train_op=train_op,
                  loss=loss)

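# Illustrative note (not part of the original file): the ManDist output is a similarity in
# (0, 1], so training regresses it toward the 0/1 duplicate label with mean squared error,
# and evaluation thresholds it at 0.5 via tf.round to report accuracy. A quick sanity check:
_sims = np.array([0.91, 0.42, 0.07])
print(np.round(_sims))  # [1. 0. 0.] -- 1 = predicted duplicate, 0 = predicted not duplicate
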
# # Training & Eval

# In[ ]:


os.environ["CUDA_VISIBLE_DEVICES"]="7" #For TEST

model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH + "/checkpoint/rnn2/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()

lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir)


# In[ ]:


lstm_est.train(train_input_fn)


# In[ ]:


lstm_est.evaluate(eval_input_fn)


# # Load test dataset & create submit dataset to kaggle

# In[ ]:


TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))


# In[ ]:


predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base": test_q1_data,
                                                          "hypothesis": test_q2_data},
                                                      shuffle=False)

predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])


# In[ ]:


print(len(predictions)) #2345796

output = pd.DataFrame(data={"test_id": test_id_data, "is_duplicate": list(predictions)})
output.to_csv("rnn_predict.csv", index=False, quoting=3)
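# Illustrative note (not part of the original file): quoting=3 is csv.QUOTE_NONE, so the
# submission values are written unquoted; the file holds the two Kaggle columns,
# 'test_id' and the predicted 'is_duplicate' similarity.
import csv
assert csv.QUOTE_NONE == 3
print(output.columns.tolist())  # e.g. ['test_id', 'is_duplicate']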
