kajyuuen committed on
Commit
065e10a
1 Parent(s): 6b4e753

Use FastTokenizer

Browse files
Files changed (4) hide show
  1. README.md +1 -2
  2. spiece.model +2 -2
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +105 -15
README.md CHANGED
@@ -12,14 +12,13 @@ This repository provides large language models trained by [SB Intuitions](https:
12
 
13
  ## How to use
14
 
15
- Please set **use_fast=False** to use our tokenizer properly.
16
 
17
  ```python
18
  import torch
19
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed
20
 
21
  model = AutoModelForCausalLM.from_pretrained("sbintuitions/sarashina2-7b", torch_dtype=torch.bfloat16, device_map="auto")
22
- tokenizer = AutoTokenizer.from_pretrained("sbintuitions/sarashina2-7b", use_fast=False)
23
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
24
  set_seed(123)
25
 
 
12
 
13
  ## How to use
14
 
 
15
 
16
  ```python
17
  import torch
18
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, set_seed
19
 
20
  model = AutoModelForCausalLM.from_pretrained("sbintuitions/sarashina2-7b", torch_dtype=torch.bfloat16, device_map="auto")
21
+ tokenizer = AutoTokenizer.from_pretrained("sbintuitions/sarashina2-7b")
22
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
23
  set_seed(123)
24
 
spiece.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2220731d9779e9df0d664d793750f216a4c829ab1d9fa028eebb84f2e2ed8f61
3
- size 1831820
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aa69ad88e8e7a8c6cb65978a4a08a34755c42e055ea755a12f5b89b804212e1
3
+ size 1831860
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,16 +1,106 @@
1
  {
2
- "extra_ids": 0,
3
- "do_lower_case": false,
4
- "keep_accents": true,
5
- "bos_token": "<s>",
6
- "eos_token": "</s>",
7
- "unk_token": "<unk>",
8
- "pad_token": "<pad>",
9
- "mask_token": "<mask>",
10
- "cls_token": "<cls>",
11
- "sep_token": "<sep>",
12
- "padding_side": "left",
13
- "sp_model_kwargs": {},
14
- "special_tokens_map_file": null,
15
- "tokenizer_class": "T5Tokenizer"
16
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  {
2
+ "extra_ids": 0,
3
+ "do_lower_case": false,
4
+ "keep_accents": true,
5
+ "bos_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "unk_token": "<unk>",
8
+ "pad_token": "<pad>",
9
+ "mask_token": "<mask>",
10
+ "cls_token": "<cls>",
11
+ "sep_token": "<sep>",
12
+ "padding_side": "left",
13
+ "sp_model_kwargs": {},
14
+ "special_tokens_map_file": null,
15
+ "tokenizer_class": "T5Tokenizer",
16
+ "added_tokens_decoder": {
17
+ "7": {
18
+ "content": "<|system|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false,
23
+ "special": false
24
+ },
25
+ "8": {
26
+ "content": "<|assistant|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false,
31
+ "special": false
32
+ },
33
+ "9": {
34
+ "content": "<|user|>",
35
+ "lstrip": false,
36
+ "normalized": false,
37
+ "rstrip": false,
38
+ "single_word": false,
39
+ "special": false
40
+ },
41
+ "10": {
42
+ "content": "<|available_apis|>",
43
+ "lstrip": false,
44
+ "normalized": false,
45
+ "rstrip": false,
46
+ "single_word": false,
47
+ "special": false
48
+ },
49
+ "11": {
50
+ "content": "<|api_calls|>",
51
+ "lstrip": false,
52
+ "normalized": false,
53
+ "rstrip": false,
54
+ "single_word": false,
55
+ "special": false
56
+ },
57
+ "12": {
58
+ "content": "<|api_results|>",
59
+ "lstrip": false,
60
+ "normalized": false,
61
+ "rstrip": false,
62
+ "single_word": false,
63
+ "special": false
64
+ },
65
+ "13": {
66
+ "content": "<|code|>",
67
+ "lstrip": false,
68
+ "normalized": false,
69
+ "rstrip": false,
70
+ "single_word": false,
71
+ "special": false
72
+ },
73
+ "14": {
74
+ "content": "<|file|>",
75
+ "lstrip": false,
76
+ "normalized": false,
77
+ "rstrip": false,
78
+ "single_word": false,
79
+ "special": false
80
+ },
81
+ "102397": {
82
+ "content": "<|prefix|>",
83
+ "lstrip": false,
84
+ "normalized": false,
85
+ "rstrip": false,
86
+ "single_word": false,
87
+ "special": false
88
+ },
89
+ "102398": {
90
+ "content": "<|suffix|>",
91
+ "lstrip": false,
92
+ "normalized": false,
93
+ "rstrip": false,
94
+ "single_word": false,
95
+ "special": false
96
+ },
97
+ "102399": {
98
+ "content": "<|middle|>",
99
+ "lstrip": false,
100
+ "normalized": false,
101
+ "rstrip": false,
102
+ "single_word": false,
103
+ "special": false
104
+ }
105
+ }
106
+ }