Skip to content

Commit da188aa

Browse files
fix: explicitly check for null when adding BPE node to queue
1 parent 648e1b4 commit da188aa

File tree

4 files changed

+90
-39
lines changed

4 files changed

+90
-39
lines changed
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
name: Build Rindow Matlib
2+
3+
on:
4+
workflow_dispatch:
5+
6+
jobs:
7+
build:
8+
name: Build rindow-matlib (${{ matrix.arch }})
9+
strategy:
10+
fail-fast: false
11+
matrix:
12+
include:
13+
- arch: x86_64
14+
runs-on: ubuntu-24.04
15+
artifact_name: linux-x86_64
16+
- arch: arm64
17+
runs-on: ubuntu-24.04-arm
18+
artifact_name: linux-arm64
19+
20+
runs-on: ${{ matrix.runs-on }}
21+
22+
steps:
23+
- name: Checkout code
24+
uses: actions/checkout@v4
25+
with:
26+
repository: rindow/rindow-matlib
27+
28+
- name: Install dependencies
29+
run: |
30+
sudo apt-get update
31+
sudo apt-get install -y build-essential cmake
32+
33+
- name: Build
34+
run: |
35+
cmake -S . -B build
36+
cmake --build build --config Release
37+
38+
- name: Run tests
39+
run: |
40+
cd build
41+
ctest -C Release
42+
43+
- name: Package
44+
run: |
45+
cd build
46+
cpack -G TGZ -C Release
47+
48+
- name: Upload package artifact
49+
uses: actions/upload-artifact@v4
50+
with:
51+
name: ${{ matrix.artifact_name }}
52+
path: packages/*

src/PreTrainedTokenizers/LlamaTokenizer.php

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ class LlamaTokenizer extends PreTrainedTokenizer
1414
protected string $defaultChatTemplate = "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\n' + system_message + '\n<</SYS>>\n\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\n' + content.strip() + '\n<</SYS>>\n\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}";
1515

1616
public const DEFAULT_SYSTEM_PROMPT =
17-
"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " .
17+
"You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your " .
1818
"answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure " .
1919
"that your responses are socially unbiased and positive in nature.\n\n" .
2020
"If a question does not make any sense, or is not factually coherent, explain why instead of answering something not " .
@@ -38,7 +38,6 @@ public function __construct(array $tokenizerJSON, array $tokenizerConfig)
3838
'prepend_scheme' => 'first',
3939
]);
4040
}
41-
4241
}
4342

4443
/**
@@ -50,7 +49,7 @@ public function __construct(array $tokenizerJSON, array $tokenizerConfig)
5049
* @param bool $addSpecialTokens
5150
* @return ?array
5251
*/
53-
public function encodeText(?string $text, string $textPair = null, bool $addSpecialTokens = true): ?array
52+
public function encodeText(?string $text, ?string $textPair = null, bool $addSpecialTokens = true): ?array
5453
{
5554
if ($text === null) {
5655
return null;

src/Tokenizers/BPEModel.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ public function addNodeToQueue(SplPriorityQueue $queue, BPENode $node): void
221221
$pairKey = json_encode([$node->token, $node->next->token]);
222222
$rank = $this->bpeRanks[$pairKey] ?? null;
223223

224-
if ($rank) {
224+
if ($rank !== null) {
225225
$node->score = - ($rank + $node->bias);
226226
$queue->insert($node, $node->score);
227227
}

tests/tokenizers/TokenizersTest.php

Lines changed: 35 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,11 @@
2121

2222
$tokenizer = AutoTokenizer::fromPretrained($tokenizerId);
2323

24-
if (is_string($test['input']))
25-
{
24+
if (is_string($test['input'])) {
2625
// Tokenize the input text
2726
$encoded = $tokenizer($test['input'], returnTensor: false);
2827

29-
// Add the input text to the encoded object for easier debugging
28+
// Add the input text to the encoded object for easier debugging
3029
$test['encoded']['input'] = $encoded['input'] = $test['input'];
3130

3231
expect($encoded)->toMatchArray($test['encoded']);
@@ -41,7 +40,7 @@
4140

4241
$decodedWithoutSpecial = $tokenizer->decode($encoded['input_ids'], skipSpecialTokens: true);
4342
expect($decodedWithoutSpecial)->toBe($test['decoded_without_special']);
44-
} else{
43+
} else {
4544

4645
['text' => $text, 'text_pair' => $textPair] = $test['input'];
4746

@@ -50,7 +49,7 @@
5049
expect($encoded)->toMatchArray($test['output']);
5150
}
5251
})
53-
->with('regular-tokenization');
52+
->with('regular-tokenization');
5453
});
5554

5655
describe('Chat templates', function () {
@@ -70,6 +69,7 @@
7069

7170
$inputIds = $tokenizer->applyChatTemplate($chat, returnTensor: false);
7271

72+
7373
expect($inputIds)
7474
->toBe([1, 733, 16289, 28793, 22557, 28725, 910, 460, 368, 28804, 733, 28748, 16289, 28793, 28737, 28742, 28719, 2548, 1598, 28723, 1602, 541, 315, 1316, 368, 3154, 28804, 2, 28705, 733, 16289, 28793, 315, 28742, 28715, 737, 298, 1347, 805, 910, 10706, 5752, 1077, 3791, 28808, 733, 28748, 16289, 28793]);
7575
});
@@ -83,35 +83,35 @@
8383
['role' => 'user', 'content' => "I'd like to show off how chat templating works!"],
8484
];
8585

86-
$chatTemplate = "{% if messages[0]['role'] == 'system' %}".
87-
"{% set loop_messages = messages[1:] %}".
88-
"{% set system_message = messages[0]['content'] %}".
89-
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}".
90-
"{% set loop_messages = messages %}".
91-
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}".
92-
"{% else %}".
93-
"{% set loop_messages = messages %}".
94-
"{% set system_message = false %}".
95-
"{% endif %}".
96-
"{% if loop_messages|length == 0 and system_message %}".
97-
"{{ bos_token + '[INST] <<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n [/INST]' }}".
98-
"{% endif %}".
99-
"{% for message in loop_messages %}".
100-
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}".
101-
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}".
102-
"{% endif %}".
103-
"{% if loop.index0 == 0 and system_message != false %}".
104-
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}".
105-
"{% else %}".
106-
"{% set content = message['content'] %}".
107-
"{% endif %}".
108-
"{% if message['role'] == 'user' %}".
109-
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}".
110-
"{% elif message['role'] == 'system' %}".
111-
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}".
112-
"{% elif message['role'] == 'assistant' %}".
113-
"{{ ' ' + content.strip() + ' ' + eos_token }}".
114-
"{% endif %}".
86+
$chatTemplate = "{% if messages[0]['role'] == 'system' %}" .
87+
"{% set loop_messages = messages[1:] %}" .
88+
"{% set system_message = messages[0]['content'] %}" .
89+
"{% elif USE_DEFAULT_PROMPT == true and not '<<SYS>>' in messages[0]['content'] %}" .
90+
"{% set loop_messages = messages %}" .
91+
"{% set system_message = 'DEFAULT_SYSTEM_MESSAGE' %}" .
92+
"{% else %}" .
93+
"{% set loop_messages = messages %}" .
94+
"{% set system_message = false %}" .
95+
"{% endif %}" .
96+
"{% if loop_messages|length == 0 and system_message %}" .
97+
"{{ bos_token + '[INST] <<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n [/INST]' }}" .
98+
"{% endif %}" .
99+
"{% for message in loop_messages %}" .
100+
"{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}" .
101+
"{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}" .
102+
"{% endif %}" .
103+
"{% if loop.index0 == 0 and system_message != false %}" .
104+
"{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}" .
105+
"{% else %}" .
106+
"{% set content = message['content'] %}" .
107+
"{% endif %}" .
108+
"{% if message['role'] == 'user' %}" .
109+
"{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}" .
110+
"{% elif message['role'] == 'system' %}" .
111+
"{{ '<<SYS>>\\n' + content.strip() + '\\n<</SYS>>\\n\\n' }}" .
112+
"{% elif message['role'] == 'assistant' %}" .
113+
"{{ ' ' + content.strip() + ' ' + eos_token }}" .
114+
"{% endif %}" .
115115
"{% endfor %}";
116116

117117
$chatTemplate = str_replace('USE_DEFAULT_PROMPT', 'true', $chatTemplate);
@@ -173,7 +173,7 @@
173173
$tokenizer = AutoTokenizer::fromPretrained('Xenova/bert-base-uncased');
174174

175175
// Expected to throw error if jagged array
176-
expect(fn () => $tokenizer->tokenize($inputs))->toThrow('Unable to create tensor');
176+
expect(fn() => $tokenizer->tokenize($inputs))->toThrow('Unable to create tensor');
177177

178178
// Truncation
179179
['input_ids' => $inputIds, 'attention_mask' => $attentionMask, 'token_type_ids' => $tokenTypeIds] = $tokenizer

0 commit comments

Comments
 (0)