# coding: utf-8

"""
A similar BI-LSTM approach, implemented with static_bidirectional_rnn.
"""


import sys
import tensorflow as tf
import numpy as np
import os
import pandas as pd

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer

from sklearn.model_selection import train_test_split

# from tensorflow.keras import backend as K

import json

tf.logging.set_verbosity(tf.logging.INFO)

# # Initial global var

# In[ ]:


## Define the global variables up front: file names, file locations, directories, etc.

DATA_IN_PATH = '../data_in/'
DATA_OUT_PATH = '../data_out/'

TRAIN_Q1_DATA_FILE = 'train_q1.npy'
TRAIN_Q2_DATA_FILE = 'train_q2.npy'
TRAIN_LABEL_DATA_FILE = 'train_label.npy'
NB_WORDS_DATA_FILE = 'data_configs.json'
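
# data_configs.json is produced by the preprocessing step; it is assumed to contain at
# least the vocabulary size under the 'vocab_size' key, which is read further below.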

# # Load Dataset

# In[ ]:


## Load the data. For efficient loading, the arrays were saved in NumPy format beforehand.

q1_data = np.load(open(DATA_IN_PATH + TRAIN_Q1_DATA_FILE, 'rb'))
q2_data = np.load(open(DATA_IN_PATH + TRAIN_Q2_DATA_FILE, 'rb'))
labels = np.load(open(DATA_IN_PATH + TRAIN_LABEL_DATA_FILE, 'rb'))
prepro_configs = None

with open(DATA_IN_PATH + NB_WORDS_DATA_FILE, 'r') as f:
    prepro_configs = json.load(f)

## Hyperparameters needed for training.

print("# of dataset: {}".format(len(labels)))

BATCH_SIZE = 4096
EPOCH = 20
HIDDEN = 64
BUFFER_SIZE = len(q1_data)

NUM_LAYERS = 3
DROPOUT_RATIO = 0.3  # fraction of units to drop; the keep probability is 1 - DROPOUT_RATIO

TEST_SPLIT = 0.1
RNG_SEED = 13371447
EMBEDDING_DIM = 128
MAX_SEQ_LEN = 31

# In[ ]:


VOCAB_SIZE = prepro_configs['vocab_size']


# # Split train and test dataset

# In[ ]:


# Sequence lengths capped at MAX_SEQ_LEN (computed for reference; not used below).
q1_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q1_data])
q2_data_len = np.array([min(len(x), MAX_SEQ_LEN) for x in q2_data])

# In[ ]:


## Split the data into train and test sets. sklearn's train_test_split is convenient here,
## but the Quora data has two inputs rather than one, so the two question arrays are
## stacked with np.stack and the stacked array is split.

X = np.stack((q1_data, q2_data), axis=1)
y = labels
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=TEST_SPLIT, random_state=RNG_SEED)

train_Q1 = train_X[:, 0]
train_Q2 = train_X[:, 1]
test_Q1 = test_X[:, 0]
test_Q2 = test_X[:, 1]
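
# Optional sanity check (assuming the preprocessed arrays are padded to MAX_SEQ_LEN,
# each split should have shape (num_examples, MAX_SEQ_LEN)):
# print(train_Q1.shape, test_Q1.shape, train_y.shape)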


# In[ ]:


def rearrange(base, hypothesis, labels):
    features = {"base": base, "hypothesis": hypothesis}
    return features, labels

def train_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((train_Q1, train_Q2, train_y))
    dataset = dataset.shuffle(buffer_size=len(train_Q1))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    dataset = dataset.repeat(EPOCH)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()

def eval_input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((test_Q1, test_Q2, test_y))
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(rearrange)
    iterator = dataset.make_one_shot_iterator()

    return iterator.get_next()
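
# Each input_fn returns a (features, labels) pair for tf.estimator: `features` is a dict
# holding the 'base' and 'hypothesis' question batches, `labels` the duplicate targets.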

class ManDist(Layer):
    """
    Keras custom layer that computes a Manhattan-distance similarity, exp(-||x0 - x1||_1).
    """

    # Initialize the layer; no need to include an inputs parameter.
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape is collected automatically to build the layer.
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    # Return the output shape.
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)

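
# A quick illustration of ManDist (hypothetical values, purely for exposition; safe to delete):
#
#     h1 = tf.constant([[1.0, 2.0]])
#     h2 = tf.constant([[1.5, 2.5]])
#     sim = ManDist()([h1, h2])   # exp(-(0.5 + 0.5)) = exp(-1.0) ≈ 0.368
#
# Because the output lies in (0, 1], it can be read directly as a similarity score.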

def BiRNN(x, dropout, scope, hidden_units):
    # `dropout` is used as the keep probability for DropoutWrapper (not the drop ratio).
    n_hidden = hidden_units
    n_layers = NUM_LAYERS
    # Prepare data shape to match `static_rnn` function requirements:
    # a length-MAX_SEQ_LEN list of (batch, embedding_dim) tensors.
    x = tf.unstack(tf.transpose(x, perm=[1, 0, 2]))
    print(x)
    # Define lstm cells with tensorflow
    # Forward direction cell
    with tf.name_scope("fw" + scope), tf.variable_scope("fw" + scope):
        stacked_rnn_fw = []
        for _ in range(n_layers):
            fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_fw_cell = tf.contrib.rnn.DropoutWrapper(fw_cell, output_keep_prob=dropout)
            stacked_rnn_fw.append(lstm_fw_cell)
        lstm_fw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_fw, state_is_tuple=True)

    # Backward direction cell
    with tf.name_scope("bw" + scope), tf.variable_scope("bw" + scope):
        stacked_rnn_bw = []
        for _ in range(n_layers):
            bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
            lstm_bw_cell = tf.contrib.rnn.DropoutWrapper(bw_cell, output_keep_prob=dropout)
            stacked_rnn_bw.append(lstm_bw_cell)
        lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell(cells=stacked_rnn_bw, state_is_tuple=True)

    # Get the lstm cell output (the cell variables are created inside this scope).
    with tf.name_scope("birnn" + scope), tf.variable_scope("birnn" + scope):
        outputs, _, _ = tf.nn.static_bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32)
    # outputs[-1]: last time step, forward and backward states concatenated -> (batch, 2 * n_hidden).
    return outputs[-1]


# # Model setup

def Malstm(features, labels, mode):

    TRAIN = mode == tf.estimator.ModeKeys.TRAIN
    EVAL = mode == tf.estimator.ModeKeys.EVAL
    PREDICT = mode == tf.estimator.ModeKeys.PREDICT

    embedding = tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)

    base_embedded_matrix = embedding(features['base'])
    hypothesis_embedded_matrix = embedding(features['hypothesis'])

    # Apply dropout only during training; DropoutWrapper expects a keep probability.
    keep_prob = 1.0 - DROPOUT_RATIO if TRAIN else 1.0

    base_semantic_matrix = BiRNN(base_embedded_matrix, keep_prob, 'base', HIDDEN)
    hypothesis_semantic_matrix = BiRNN(hypothesis_embedded_matrix, keep_prob, 'hypothesis', HIDDEN)

    logit_layer = ManDist()([base_semantic_matrix, hypothesis_semantic_matrix])
    logit_layer = tf.squeeze(logit_layer, axis=-1)

    # self._ma_dist([q1_lstm, q2_lstm])

    # logit_layer = tf.exp(-tf.reduce_sum(tf.abs(base_semantic_matrix - hypothesis_semantic_matrix), axis=1, keepdims=True))
    # logit_layer = tf.squeeze(logit_layer, axis=-1)
    #
    if PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={
                'is_duplicate': logit_layer
            })

    # During prediction, labels is None.
    if labels is not None:
        labels = tf.to_float(labels)

    # loss = tf.reduce_mean(tf.keras.metrics.binary_crossentropy(y_true=labels, y_pred=logit_layer))
    loss = tf.losses.mean_squared_error(labels=labels, predictions=logit_layer)
    # loss = tf.reduce_mean(tf.losses.sigmoid_cross_entropy(labels, logit_layer))

    if EVAL:
        accuracy = tf.metrics.accuracy(labels, tf.round(logit_layer))
        eval_metric_ops = {'acc': accuracy}
        return tf.estimator.EstimatorSpec(
            mode=mode,
            eval_metric_ops=eval_metric_ops,
            loss=loss)

    elif TRAIN:

        global_step = tf.train.get_global_step()
        train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step)

        return tf.estimator.EstimatorSpec(
            mode=mode,
            train_op=train_op,
            loss=loss)

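
# Note: this follows the MaLSTM (Manhattan LSTM) idea -- exp(-L1 distance) already lies in
# (0, 1], so it is regressed against the binary duplicate label with a mean squared error
# loss; the cross-entropy variants above are kept as commented-out alternatives.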

# # Training & Eval

# In[ ]:


os.environ["CUDA_VISIBLE_DEVICES"] = "7"  # For TEST

model_dir = os.path.join(os.getcwd(), DATA_OUT_PATH, "checkpoint/rnn2/")
os.makedirs(model_dir, exist_ok=True)

config_tf = tf.estimator.RunConfig()

lstm_est = tf.estimator.Estimator(Malstm, model_dir=model_dir, config=config_tf)


# In[ ]:


lstm_est.train(train_input_fn)
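
# train_input_fn repeats the dataset EPOCH times, so train() runs until that repeated
# dataset is exhausted; no explicit `steps` argument is needed here.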


# In[ ]:


lstm_est.evaluate(eval_input_fn)


# # Load test dataset & create a submission file for Kaggle

# In[ ]:


TEST_Q1_DATA_FILE = 'test_q1.npy'
TEST_Q2_DATA_FILE = 'test_q2.npy'
TEST_ID_DATA_FILE = 'test_id.npy'

test_q1_data = np.load(open(DATA_IN_PATH + TEST_Q1_DATA_FILE, 'rb'))
test_q2_data = np.load(open(DATA_IN_PATH + TEST_Q2_DATA_FILE, 'rb'))
test_id_data = np.load(open(DATA_IN_PATH + TEST_ID_DATA_FILE, 'rb'))


# In[ ]:


predict_input_fn = tf.estimator.inputs.numpy_input_fn(x={"base": test_q1_data,
                                                          "hypothesis": test_q2_data},
                                                      shuffle=False)

predictions = np.array([p['is_duplicate'] for p in lstm_est.predict(input_fn=predict_input_fn)])
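
# With shuffle=False, numpy_input_fn (default batch size 128) yields predictions in the
# same order as test_id_data, so the ids and scores line up in the output frame below.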


# In[ ]:


print(len(predictions))  # 2345796

output = pd.DataFrame(data={"test_id": test_id_data, "is_duplicate": list(predictions)})
output.to_csv("rnn_predict.csv", index=False, quoting=3)
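
# quoting=3 is csv.QUOTE_NONE, so the submission columns (test_id, is_duplicate) are written unquoted.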