Fix tokenizer with \n\n (#15)
parent
ce666962e7
commit
d76ed403c3
|
|
@ -117,6 +117,11 @@ def to_word_list_format(word_dict, tokenizer):
|
|||
item_flat_ids += ids
|
||||
item_offsets.append(len(ids))
|
||||
|
||||
if word == "\n\n":
|
||||
ids = tokenizer.encode("\n") * 2
|
||||
item_flat_ids += ids
|
||||
item_offsets.append(len(ids))
|
||||
|
||||
flat_ids.append(np.array(item_flat_ids))
|
||||
offsets.append(np.cumsum(np.array(item_offsets)))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue