@@ -135,7 +135,7 @@ def get_data(
              is_cls=False, is_meta=False):
     tqdm_notebook = tqdm
     if label2idx is None:
-        label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
+        label2idx = {pad: 0, '[CLS]': 1}
     features = []
     all_args = []
     if is_cls:
@@ -176,15 +176,15 @@ def get_data(
         labels = str(labels).split()
         pad_idx = label2idx[pad]
         assert len(orig_tokens) == len(labels)
-        prev_label = ""
+        # prev_label = ""
         for idx_, (orig_token, label) in enumerate(zip(orig_tokens, labels)):
             # Fix BIO to IO as BERT proposed https://arxiv.org/pdf/1810.04805.pdf
             prefix = "I_"
             if label != "O":
                 label = label.split("_")[1]
-                prev_label = label
-            else:
-                prev_label = label
+                # prev_label = label
+            # else:
+                # prev_label = label
 
             cur_tokens = tokenizer.tokenize(orig_token)
             if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
@@ -196,11 +196,12 @@ def get_data(
             # ["I_" + label] * (len(cur_tokens) - 1)
             bert_label = [prefix + label] + ["X"] * (len(cur_tokens) - 1)
             bert_labels.extend(bert_label)
-        bert_tokens.append("[SEP]")
-        bert_labels.append("[SEP]")
+        # bert_tokens.append("[SEP]")
+        # bert_labels.append("[SEP]")
         if is_meta:
             meta_tokens.append([0] * len(meta[0]))
-        orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]
+        # + ["[SEP]"]
+        orig_tokens = ["[CLS]"] + orig_tokens
 
         input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
         labels = bert_labels
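
Taken together, the change stops emitting a trailing [SEP] anywhere: it is dropped from the default label2idx map, from bert_tokens/bert_labels, and from orig_tokens, and the now-unused prev_label bookkeeping is commented out. What remains of the encoding is: collapse BIO tags to IO (as the linked BERT paper suggests for NER), label WordPiece continuation pieces with the placeholder "X", and prepend only [CLS]. The sketch below is a minimal, self-contained illustration of that labeling scheme, not the repository's code: encode_sentence, the toy tokenizer, and the example sentence are invented for demonstration; it assumes bert_tokens/bert_labels are seeded with [CLS] (as the surviving '[CLS]' entry in label2idx and the orig_tokens line imply), and it keeps plain "O" tags as "O", which may differ in detail from the repo's prefix handling.

# Minimal sketch (hypothetical names, not the repository's code).
def encode_sentence(orig_tokens, labels, tokenize):
    """Collapse BIO tags to IO, label subword continuations with "X",
    and prepend [CLS] with no trailing [SEP]."""
    bert_tokens, bert_labels = ["[CLS]"], ["[CLS]"]  # assumed seeding
    for orig_token, label in zip(orig_tokens, labels):
        if label != "O":
            # "B_PER" and "I_PER" both become "I_PER" (BIO -> IO)
            label = "I_" + label.split("_")[1]
        pieces = tokenize(orig_token)
        bert_tokens.extend(pieces)
        # First piece keeps the tag; continuation pieces get "X"
        bert_labels.extend([label] + ["X"] * (len(pieces) - 1))
    return bert_tokens, bert_labels

# Toy WordPiece-style tokenizer, purely for demonstration.
toy_pieces = {"Johanson": ["Johan", "##son"]}
tokens, tags = encode_sentence(
    ["Johanson", "lives", "here"],
    ["B_PER", "O", "O"],
    lambda w: toy_pieces.get(w, [w]),
)
# tokens -> ['[CLS]', 'Johan', '##son', 'lives', 'here']
# tags   -> ['[CLS]', 'I_PER', 'X', 'O', 'O']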