CodeWithKyrian
diff --git a/examples/bootstrap.php b/examples/bootstrap.php
Lines changed: 3 additions & 2 deletions
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
Lines changed: 7 additions & 4 deletions
diff --git a/examples/pipelines/text-classification.php b/examples/pipelines/text-classification.php
Lines changed: 3 additions & 7 deletions
diff --git a/examples/pipelines/text-generation.php b/examples/pipelines/text-generation.php
Lines changed: 2 additions & 1 deletion
diff --git a/examples/pipelines/text2text-generation.php b/examples/pipelines/text2text-generation.php
Lines changed: 3 additions & 3 deletions
diff --git a/examples/pipelines/token-classification.php b/examples/pipelines/token-classification.php
Lines changed: 2 additions & 2 deletions
diff --git a/src/Decoders/ByteFallback.php b/src/Decoders/ByteFallback.php
Lines changed: 19 additions & 8 deletions
diff --git a/src/Decoders/ByteLevelDecoder.php b/src/Decoders/ByteLevelDecoder.php
Lines changed: 4 additions & 7 deletions
diff --git a/src/Decoders/CTCDecoder.php b/src/Decoders/CTCDecoder.php
Lines changed: 3 additions & 3 deletions
diff --git a/src/Decoders/ReplaceDecoder.php b/src/Decoders/ReplaceDecoder.php
Lines changed: 1 addition & 2 deletions
@@ -7,5 +7,6 @@
 
 require_once './vendor/autoload.php';
 
-Transformers::setup()->setImageDriver(ImageDriver::VIPS);
-
+Transformers::setup()
+    ->setCacheDir('/Users/Kyrian/.transformers')
+    ->setImageDriver(ImageDriver::VIPS);
@@ -22,14 +22,17 @@
 //$audioUrl = __DIR__ . '/../sounds/taunt.wav';
 //$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
 //$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
-//$audioUrl = __DIR__ . '/../sounds/ted_60.wav';
-$audioUrl = __DIR__ . '/../sounds/sample-1.mp3';
+$audioUrl = __DIR__ . '/../sounds/ted_60.wav';
+//$audioUrl = __DIR__ . '/../sounds/sample-1.mp3';
 
+$streamer = WhisperTextStreamer::make()
+//->onTimestampStart(fn($timestamp) => dump($timestamp));
+->onStream(fn($text) => print($text));
 
 $output = $transcriber($audioUrl,
     maxNewTokens: 256,
     chunkLengthSecs: 24,
-//    returnTimestamps: 'word',
+    streamer: $streamer,
 );
 
-dd($output, timeUsage(), memoryUsage());
+dd($output, timeUsage(), memoryUsage());
@@ -7,17 +7,13 @@
 require_once './bootstrap.php';
 
 
-//$classifier = pipeline('text-classification', 'Xenova/toxic-bert');
+$classifier = pipeline('text-classification', 'Xenova/toxic-bert');
 //
 //$result = $classifier("I hate you! You gave me life but in misery", topK: -1);
 
 
-$classifier = pipeline('text-classification', 'Xenova/distilbert-base-uncased-mnli');
-
-$result = $classifier('I love you!, You frustrated my life');
+// $classifier = pipeline('text-classification', 'Xenova/distilbert-base-uncased-mnli');
 
+$result = $classifier('I want to beat him to pulp', topK: -1);
 
 dd($result);
-
-
-
@@ -14,7 +14,8 @@
 
 //$generator = pipeline('text-generation', 'Xenova/gpt2');
 //$generator = pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat');
-$generator = pipeline('text-generation', 'Xenova/TinyLlama-1.1B-Chat-v1.0');
+//$generator = pipeline('text-generation', 'Xenova/TinyLlama-1.1B-Chat-v1.0');
+$generator = pipeline('text-generation', 'onnx-community/Llama-3.2-1B-Instruct', modelFilename: 'model_q4');
 
 $streamer = TextStreamer::make()->shouldSkipPrompt();
 
 
@@ -9,8 +9,8 @@
 
 ini_set('memory_limit', -1);
 
-$generator = pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M');
-//$generator = pipeline('text2text-generation', 'Xenova/flan-t5-small', quantized: true);
+//$generator = pipeline('text2text-generation', 'Xenova/LaMini-Flan-T5-783M');
+$generator = pipeline('text2text-generation', 'Xenova/flan-t5-small', quantized: true);
 
 $streamer = TextStreamer::make();
 
@@ -22,4 +22,4 @@
 $output = $generator($query, streamer: $streamer, maxNewTokens: 256, doSample: true, repetitionPenalty: 1.1, temperature: 0.7);
 
 //dd($output);
-dd('Done', timeUsage(), memoryUsage());
+dd('Done', timeUsage(), memoryUsage());
@@ -11,8 +11,8 @@
 
 ini_set('memory_limit', -1);
 
-//$classifier = pipeline('token-classification', 'Xenova/bert-base-NER');
-$classifier = pipeline('token-classification', 'codewithkyrian/bert-english-uncased-finetuned-pos');
+ $classifier = pipeline('token-classification', 'Xenova/bert-base-NER');
+//$classifier = pipeline('token-classification', 'codewithkyrian/bert-english-uncased-finetuned-pos');
 
 $output = $classifier(
     'My name is Kyrian and I live in Nigeria',
 
@@ -18,31 +18,42 @@ public function __construct(array $config)
 
     protected function decodeChain(array $tokens): array
     {
-        $newTokens = [];
         $previousByteTokens = [];
+        $newTokens = [];
 
         foreach ($tokens as $token) {
             $bytes = null;
+
+            // Check if the token is of the form <0xXX>
             if (strlen($token) === 6 && str_starts_with($token, '<0x') && str_ends_with($token, '>')) {
+                // Extract the hexadecimal value from the token
                 $byte = hexdec(substr($token, 3, 2));
                 if (!is_nan($byte)) {
                     $bytes = $byte;
                 }
             }
+
             if ($bytes !== null) {
+                // Add byte to previousByteTokens
                 $previousByteTokens[] = $bytes;
             } else {
-                if (count($previousByteTokens) > 0) {
-                    $string = $this->bytesToString($previousByteTokens);
-                    $newTokens[] = $string;
-                    $previousByteTokens = [];
+                // If we have accumulated byte tokens, decode them to a string
+                if (!empty($previousByteTokens)) {
+                    $string = pack('C*', ...$previousByteTokens);  // Convert bytes back to string
+                    $newTokens[] = $string;  // Add decoded string to newTokens
+                    $previousByteTokens = [];  // Reset byte accumulator
                 }
+                // Add the non-byte token to newTokens
                 $newTokens[] = $token;
             }
         }
-        if (count($previousByteTokens) > 0) {
-            $string = $this->bytesToString($previousByteTokens);
+
+
+        // After the loop, if there are still byte tokens, decode them
+        if (!empty($previousByteTokens)) {
+            $string = pack('C*', ...$previousByteTokens);  // Convert remaining bytes to string
             $newTokens[] = $string;
+            $previousByteTokens = [];  // Reset byte accumulator
         }
 
         return $newTokens;
@@ -59,4 +70,4 @@ protected function bytesToString(array $bytes): string
         $binaryString = pack('C*', ...$bytes);
         return mb_convert_encoding($binaryString, 'ISO-8859-1');
     }
-}
+}
@@ -6,7 +6,7 @@
 namespace Codewithkyrian\Transformers\Decoders;
 
 use Codewithkyrian\Transformers\Tokenizers\AddedToken;
-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
 use SplFixedArray;
 
 class ByteLevelDecoder extends Decoder
@@ -287,7 +287,7 @@ public function convertTokensToString(array $tokens): string
 
         $binaryString = pack('C*', ...$byteArray);
 
-        return mb_convert_encoding($binaryString, 'ISO-8859-1');
+        return mb_convert_encoding($binaryString, 'UTF-8');
     }
 
     protected function decodeChain(array $tokens): array
@@ -298,9 +298,7 @@ protected function decodeChain(array $tokens): array
         foreach ($tokens as $token) {
             // No need to check skip_special_tokens since the tokens are already filtered
 
-            $addedToken = array_filter($this->addedTokens, function (AddedToken $x) use ($token) {
-                return $x->content === $token;
-            });
+            $addedToken = array_filter($this->addedTokens, fn (AddedToken $x) => $x->content === $token);
 
             if (!empty($addedToken)) {
                 if (!empty($currentSubText)) {
@@ -319,7 +317,6 @@ protected function decodeChain(array $tokens): array
         }
 
         // TODO: add spaces_between_special_tokens and clean_up_tokenization_spaces options
-
         return $subTexts;
     }
-}
+}
@@ -5,7 +5,7 @@
 
 namespace Codewithkyrian\Transformers\Decoders;
 
-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;
+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;
 
 /**
  * The CTC (Connectionist Temporal Classification) decoder.
@@ -65,7 +65,7 @@ function convertTokensToString(array $tokens): string
         $text = implode('', $filteredTokens);
         if ($this->cleanup) {
             // Cleanup and replace delimiter token
-            $text = trim(str_replace($this->wordDelimiterToken, ' ', Tokenizer::cleanUpTokenization($text)));
+            $text = trim(str_replace($this->wordDelimiterToken, ' ', TokenizerModel::cleanUpTokenization($text)));
         }
 
         return $text;
@@ -75,4 +75,4 @@ protected function decodeChain(array $tokens): array
     {
         return [$this->convertTokensToString($tokens)];
     }
-}
+}
@@ -2,7 +2,6 @@
 
 declare(strict_types=1);
 
-
 namespace Codewithkyrian\Transformers\Decoders;
 
 class ReplaceDecoder extends Decoder
@@ -38,4 +37,4 @@ protected function decodeChain(array $tokens): array
             return $token;
         }, $tokens);
     }
-}
+}
Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`
`6`	`6`	`namespace Codewithkyrian\Transformers\Decoders;`
`7`	`7`
`8`		`-use Codewithkyrian\Transformers\Tokenizers\Tokenizer;`
	`8`	`+use Codewithkyrian\Transformers\Tokenizers\TokenizerModel;`
`9`	`9`
`10`	`10`	`/**`
`11`	`11`	`* The CTC (Connectionist Temporal Classification) decoder.`
`@@ -65,7 +65,7 @@ function convertTokensToString(array $tokens): string`
`65`	`65`	`$text = implode('', $filteredTokens);`
`66`	`66`	`if ($this->cleanup) {`
`67`	`67`	`// Cleanup and replace delimiter token`
`68`		`- $text = trim(str_replace($this->wordDelimiterToken, ' ', Tokenizer::cleanUpTokenization($text)));`
	`68`	`+ $text = trim(str_replace($this->wordDelimiterToken, ' ', TokenizerModel::cleanUpTokenization($text)));`
`69`	`69`	`}`
`70`	`70`
`71`	`71`	`return $text;`
`@@ -75,4 +75,4 @@ protected function decodeChain(array $tokens): array`
`75`	`75`	`{`
`76`	`76`	`return [$this->convertTokensToString($tokens)];`
`77`	`77`	`}`
`78`		`-}`
	`78`	`+}`