From d76ed403c330b41cb7aa1716a591322d4db33e0c Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sun, 26 Mar 2023 19:47:32 +0800 Subject: [PATCH] Fix tokenizer with \n\n (#15) --- tabby/server/triton.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tabby/server/triton.py b/tabby/server/triton.py index 774578f..148951a 100644 --- a/tabby/server/triton.py +++ b/tabby/server/triton.py @@ -117,6 +117,11 @@ def to_word_list_format(word_dict, tokenizer): item_flat_ids += ids item_offsets.append(len(ids)) + if word == "\n\n": + ids = tokenizer.encode("\n") * 2 + item_flat_ids += ids + item_offsets.append(len(ids)) + flat_ids.append(np.array(item_flat_ids)) offsets.append(np.cumsum(np.array(item_offsets)))