Fix tokenizer with \n\n (#15)

add-more-languages
Meng Zhang 2023-03-26 19:47:32 +08:00 committed by GitHub
parent ce666962e7
commit d76ed403c3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 5 additions and 0 deletions

View File

@ -117,6 +117,11 @@ def to_word_list_format(word_dict, tokenizer):
item_flat_ids += ids
item_offsets.append(len(ids))
if word == "\n\n":
ids = tokenizer.encode("\n") * 2
item_flat_ids += ids
item_offsets.append(len(ids))
flat_ids.append(np.array(item_flat_ids))
offsets.append(np.cumsum(np.array(item_offsets)))