8000 remove sep from data · flypythoncom/ner-bert@29cc4bc · GitHub
[go: up one dir, main page]

Skip to content

Commit 29cc4bc

Browse files
committed
remove sep from data
1 parent 0f37d17 commit 29cc4bc

File tree

2 files changed

+10
-17
lines changed

2 files changed

+10
-17
lines changed

modules/data/bert_data.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def get_data(
135135
is_cls=False, is_meta=False):
136136
tqdm_notebook = tqdm
137137
if label2idx is None:
138-
label2idx = {pad: 0, '[CLS]': 1, '[SEP]': 2}
138+
label2idx = {pad: 0, '[CLS]': 1}
139139
features = []
140140
all_args = []
141141
if is_cls:
@@ -176,15 +176,15 @@ def get_data(
176176
labels = str(labels).split()
177177
pad_idx = label2idx[pad]
178178
assert len(orig_tokens) == len(labels)
179-
prev_label = ""
179+
# prev_label = ""
180180
for idx_, (orig_token, label) in enumerate(zip(orig_tokens, labels)):
181181
# Fix BIO to IO as BERT proposed https://arxiv.org/pdf/1810.04805.pdf
182182
prefix = "I_"
183183
if label != "O":
184184
label = label.split("_")[1]
185-
prev_label = label
186-
else:
187-
prev_label = label
185+
# prev_label = label
186+
# else:
187+
# prev_label = label
188188

189189
cur_tokens = tokenizer.tokenize(orig_token)
190190
if max_seq_len - 1 < len(bert_tokens) + len(cur_tokens):
@@ -196,11 +196,12 @@ def get_data(
196196
# ["I_" + label] * (len(cur_tokens) - 1)
197197
bert_label = [prefix + label] + ["X"] * (len(cur_tokens) - 1)
198198
bert_labels.extend(bert_label)
199-
bert_tokens.append("[SEP]")
200-
bert_labels.append("[SEP]")
199+
# bert_tokens.append("[SEP]")
200+
# bert_labels.append("[SEP]")
201201
if is_meta:
202202
meta_tokens.append([0] * len(meta[0]))
203-
orig_tokens = ["[CLS]"] + orig_tokens + ["[SEP]"]
203+
# + ["[SEP]"]
204+
orig_tokens = ["[CLS]"] + orig_tokens
204205

205206
input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
206207
labels = bert_labels

modules/layers/embedders.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -120,19 +120,11 @@ def create(cls,
120120
class Word2VecEmbedder(nn.Module):
121121
def __init__(self,
122122
vocab_size,
123-
embedding_dim=300,
124-
padding_idx=0,
125-
trainable=True,
126-
normalize=True):
123+
embedding_dim=300):
127124
super(Word2VecEmbedder, self).__init__()
128-
self.pad_id = padding_idx
129125
self.vocab_size = vocab_size
130126
self.embedding_dim = embedding_dim
131127
self.model = nn.Embedding(vocab_size, embedding_dim, padding_idx=self.pad_id)
132-
133-
self.trainable = trainable
134-
self.normalize = normalize
135-
136128
if normalize:
137129
weight = self.embedding.weight
138130
norms = weight.data.norm(2, 1)

0 commit comments

Comments (0)