|
21 | 21 |
|
22 | 22 | $tokenizer = AutoTokenizer::fromPretrained($tokenizerId); |
23 | 23 |
|
24 | | - if (is_string($test['input'])) |
25 | | - { |
| 24 | + if (is_string($test['input'])) { |
26 | 25 | // Tokenize the input text |
27 | 26 | $encoded = $tokenizer($test['input'], returnTensor: false); |
28 | 27 |
|
29 | | - // Add the input text to the encoded object for easier debugging |
| 28 | + // Add the input text to the encoded object for easier debugging |
30 | 29 | $test['encoded']['input'] = $encoded['input'] = $test['input']; |
31 | 30 |
|
32 | 31 | expect($encoded)->toMatchArray($test['encoded']); |
|
41 | 40 |
|
42 | 41 | $decodedWithoutSpecial = $tokenizer->decode($encoded['input_ids'], skipSpecialTokens: true); |
43 | 42 | expect($decodedWithoutSpecial)->toBe($test['decoded_without_special']); |
44 | | - } else{ |
| 43 | + } else { |
45 | 44 |
|
46 | 45 | ['text' => $text, 'text_pair' => $textPair] = $test['input']; |
47 | 46 |
|
|
50 | 49 | expect($encoded)->toMatchArray($test['output']); |
51 | 50 | } |
52 | 51 | }) |
53 | | - ->with('regular-tokenization'); |
| 52 | + ->with('regular-tokenization'); |
54 | 53 | }); |
55 | 54 |
|
56 | 55 | describe('Chat templates', function () { |
|
70 | 69 |
|
71 | 70 | $inputIds = $tokenizer->applyChatTemplate($chat, returnTensor: false); |
72 | 71 |
|
| 72 | + |
73 | 73 | expect($inputIds) |
74 | 74 | ->toBe([1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793]); |
75 | 75 | }); |
|
83 | 83 | ['role' => 'user', 'content' => "I'd like to show off how chat templating works!"], |
84 | 84 | ]; |
85 | 85 |
|
86 | | - $chatTemplate = "{% if messages[0]['role'] == 'system' %}". |
87 | | - "{% set loop_messages = messages[1:] %}". |
88 | | - "{% set system_message = messages[0]['content'] %}". |
89 | | - "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}". |
90 | | - "{% set loop_messages = messages %}". |
91 | | - "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}". |
92 | | - "{% else %}". |
93 | | - "{% set loop_messages = messages %}". |
94 | | - "{% set system_message = false %}". |
95 | | - "{% endif %}". |
96 | | - "{% if loop_messages|length == 0 and system_message %}". |
97 | | - "{{ bos_token + '[INST] <<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n [/INST]' }}". |
98 | | - "{% endif %}". |
99 | | - "{% for message in loop_messages %}". |
100 | | - "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}". |
101 | | - "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}". |
102 | | - "{% endif %}". |
103 | | - "{% if loop.index0 == 0 and system_message != false %}". |
104 | | - "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}". |
105 | | - "{% else %}". |
106 | | - "{% set content = message['content'] %}". |
107 | | - "{% endif %}". |
108 | | - "{% if message['role'] == 'user' %}". |
109 | | - "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}". |
110 | | - "{% elif message['role'] == 'system' %}". |
111 | | - "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}". |
112 | | - "{% elif message['role'] == 'assistant' %}". |
113 | | - "{{ ' ' + content.strip() + ' ' + eos_token }}". |
114 | | - "{% endif %}". |
| 86 | + $chatTemplate = "{% if messages[0]['role'] == 'system' %}" . |
| 87 | + "{% set loop_messages = messages[1:] %}" . |
| 88 | + "{% set system_message = messages[0]['content'] %}" . |
| 89 | + "{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}" . |
| 90 | + "{% set loop_messages = messages %}" . |
| 91 | + "{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" . |
| 92 | + "{% else %}" . |
| 93 | + "{% set loop_messages = messages %}" . |
| 94 | + "{% set system_message = false %}" . |
| 95 | + "{% endif %}" . |
| 96 | + "{% if loop_messages|length == 0 and system_message %}" . |
| 97 | + "{{ bos_token + '[INST] <<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n [/INST]' }}" . |
| 98 | + "{% endif %}" . |
| 99 | + "{% for message in loop_messages %}" . |
| 100 | + "{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" . |
| 101 | + "{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" . |
| 102 | + "{% endif %}" . |
| 103 | + "{% if loop.index0 == 0 and system_message != false %}" . |
| 104 | + "{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}" . |
| 105 | + "{% else %}" . |
| 106 | + "{% set content = message['content'] %}" . |
| 107 | + "{% endif %}" . |
| 108 | + "{% if message['role'] == 'user' %}" . |
| 109 | + "{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" . |
| 110 | + "{% elif message['role'] == 'system' %}" . |
| 111 | + "{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}" . |
| 112 | + "{% elif message['role'] == 'assistant' %}" . |
| 113 | + "{{ ' ' + content.strip() + ' ' + eos_token }}" . |
| 114 | + "{% endif %}" . |
115 | 115 | "{% endfor %}"; |
116 | 116 |
|
117 | 117 | $chatTemplate = str_replace('USE_DEFAULT_PROMPT', 'true', $chatTemplate); |
|
173 | 173 | $tokenizer = AutoTokenizer::fromPretrained('Xenova/bert-base-uncased'); |
174 | 174 |
|
175 | 175 | // Expected to throw error if jagged array |
176 | | - expect(fn () => $tokenizer->tokenize($inputs))->toThrow('Unable to create tensor'); |
| 176 | + expect(fn() => $tokenizer->tokenize($inputs))->toThrow('Unable to create tensor'); |
177 | 177 |
|
178 | 178 | // Truncation |
179 | 179 | ['input_ids' => $inputIds, 'attention_mask' => $attentionMask, 'token_type_ids' => $tokenTypeIds] = $tokenizer |
|
0 commit comments