CodeWithKyrian
diff --git a/examples/.gitignore b/examples/.gitignore
Lines changed: 2 additions & 1 deletion
diff --git a/examples/bootstrap.php b/examples/bootstrap.php
Lines changed: 1 addition & 1 deletion
diff --git a/examples/misc/general-test.php b/examples/misc/general-test.php
Lines changed: 0 additions & 21 deletions
diff --git a/examples/transformers.log b/examples/transformers.log
diff --git a/src/FeatureExtractors/ASTFeatureExtractor.php b/src/FeatureExtractors/ASTFeatureExtractor.php
Lines changed: 4 additions & 4 deletions
diff --git a/src/FeatureExtractors/DetrFeatureExtractor.php b/src/FeatureExtractors/DetrFeatureExtractor.php
Lines changed: 4 additions & 5 deletions
diff --git a/src/FeatureExtractors/FeatureExtractor.php b/src/FeatureExtractors/FeatureExtractor.php
Lines changed: 1 addition & 1 deletion
diff --git a/src/FeatureExtractors/ImageFeatureExtractor.php b/src/FeatureExtractors/ImageFeatureExtractor.php
Lines changed: 14 additions & 70 deletions
diff --git a/src/FeatureExtractors/WhisperFeatureExtractor.php b/src/FeatureExtractors/WhisperFeatureExtractor.php
Lines changed: 9 additions & 9 deletions
diff --git a/src/Models/ModelArchitecture.php b/src/Models/ModelArchitecture.php
Lines changed: 14 additions & 16 deletions
@@ -1,4 +1,5 @@
 vendor
 .transformers-cache/*
 composer.lock
-paddleocr
+paddleocr
+*.log
@@ -16,7 +16,7 @@ public function __construct(protected string $filename) {}
 
     public function log($level, $message, array $context = []): void
     {
-        $line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context));
+        $line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context, JSON_UNESCAPED_SLASHES));
         file_put_contents($this->filename, $line, FILE_APPEND);
     }
 }
 
@@ -46,13 +46,13 @@ public function __construct(array $config)
 
     /**
      *  Extracts features from a given audio using the provided configuration.
-     * @param Tensor $waveform The audio tensor to extract features from.
+     * @param Tensor $input The audio tensor to extract features from.
      * @return Tensor[] The extracted features.
      */
-    public function __invoke(Tensor $waveform): array
+    public function __invoke($input, ...$args): array
     {
         $features = Audio::spectrogram(
-            $waveform,
+            $input,
             $this->window,
             frameLength: 400,
             hopLength: 160,
@@ -72,4 +72,4 @@ public function __invoke(Tensor $waveform): array
             'input_values' => $features->add(-$this->mean)->multiply(1 / $this->std)->unsqueeze(0)
         ];
     }
-}
+}
@@ -16,13 +16,12 @@ class DetrFeatureExtractor extends ImageFeatureExtractor
     /**
      * Calls the feature extraction process on an array of images, preprocesses
      * each image, and concatenates the resulting features into a single Tensor.
-     * @param Image|array $images The image(s) to extract features from.
+     * @param Image|array $input The image(s) to extract features from.
      * @return array An object containing the concatenated pixel values of the preprocessed images.
      */
-    public function __invoke(Image|array $images, ...$args): array
+    public function __invoke($input, ...$args): array
     {
-        $result = parent::__invoke($images, $args);
-
+        $result = parent::__invoke($input, $args);
 
         // TODO support differently-sized images, for now assume all images are the same size.
         // TODO support different mask sizes (not just 64x64)
@@ -49,4 +48,4 @@ public function postProcessObjectDetection(ObjectDetectionOutput $outputs, float
     {
         return Processor::postProcessObjectDetection($outputs, $threshold, $targetSizes, $isZeroShot);
     }
-}
+}
@@ -12,7 +12,7 @@ class FeatureExtractor
 {
     public function __construct(public array $config) {}
 
-    public function __invoke(mixed $input, ...$args)
+    public function __invoke($input, ...$args)
     {
         return $input;
     }
 
@@ -8,7 +8,6 @@
 use Codewithkyrian\Transformers\Tensor\Tensor;
 use Codewithkyrian\Transformers\Utils\Image;
 use Exception;
-use Imagine\Image\Point;
 
 class ImageFeatureExtractor extends FeatureExtractor
 {
@@ -78,61 +77,6 @@ public function __construct(public array $config)
         }
     }
 
-
-    /**
-     * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
-     *
-     * @param int $grayThreshold Value below which pixels are considered to be gray.
-     *
-     * @return static The cropped image.
-     */
-    public function cropMargin(Image $image, int $grayThreshold = 200): static
-    {
-        $grayImage = $image->clone()->grayscale();
-
-        // Get the min and max pixel values
-        $minValue = min($grayImage->toTensor()->buffer())[0];
-        $maxValue = max($grayImage->toTensor()->buffer())[0];
-
-        $diff = $maxValue - $minValue;
-
-        // If all pixels have the same value, no need to crop
-        if ($diff === 0) {
-            return $this;
-        }
-
-        $threshold = $grayThreshold / 255;
-
-        $xMin = $image->width();
-        $yMin = $image->height();
-        $xMax = 0;
-        $yMax = 0;
-
-        $width = $image->width();
-        $height = $image->height();
-
-        // Iterate over each pixel in the image
-        for ($y = 0; $y < $height; ++$y) {
-            for ($x = 0; $x < $width; ++$x) {
-                $color = $grayImage->image->getColorAt(new Point($x, $y));
-                $pixelValue = $color->getRed(); // Assuming grayscale, so red channel is sufficient
-
-                if (($pixelValue - $minValue) / $diff < $threshold) {
-                    // We have a non-gray pixel, so update the min/max values accordingly
-                    $xMin = min($xMin, $x);
-                    $yMin = min($yMin, $y);
-                    $xMax = max($xMax, $x);
-                    $yMax = max($yMax, $y);
-                }
-            }
-        }
-
-        // Crop the image using the calculated bounds
-        $image->crop($xMin, $yMin, $xMax, $yMax);
-
-        return $this;
-    }
-
     /**
      * Pad the image by a certain amount.
      *
@@ -152,8 +96,7 @@ public function padImage(
         string    $mode = 'constant',
         bool      $center = false,
         int       $constantValues = 0
-    ): Tensor
-    {
+    ): Tensor {
         if ($tensorFormat === 'CHW') {
             [$imageChannels, $imageHeight, $imageWidth] = $imageTensor->shape();
         } else {
@@ -324,7 +267,7 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
         } elseif ($this->sizeDivisibility != null) {
             return $this->enforceSizeDivisibility([$srcWidth, $srcHeight], $this->sizeDivisibility);
         } else {
-            throw new Exception("Could not resize image due to unsupported 'size' parameter passed: ".json_encode($size));
+            throw new Exception("Could not resize image due to unsupported 'size' parameter passed: " . json_encode($size));
         }
     }
 
@@ -347,8 +290,7 @@ public function preprocess(
         ?bool $doPad = null,
         ?bool $doConvertRGB = null,
         ?bool $doConvertGrayscale = null
-    ): array
-    {
+    ): array {
         if ($this->doCropMargin) {
             // Specific to nougat processors. This is done before resizing,
             // and can be interpreted as a pre-preprocessing step.
@@ -400,7 +342,7 @@ public function preprocess(
         if ($doNormalize ?? $this->doNormalize) {
             if (is_array($this->imageMean)) {
                 // Negate the mean values to add instead of subtract
-                $negatedMean = array_map(fn ($mean) => -$mean, $this->imageMean);
+                $negatedMean = array_map(fn($mean) => -$mean, $this->imageMean);
                 $imageMean = Tensor::repeat($negatedMean, $image->height() * $image->width(), 1);
             } else {
                 $imageMean = Tensor::fill([$image->channels * $image->height() * $image->width()], -$this->imageMean);
@@ -409,7 +351,7 @@ public function preprocess(
 
             if (is_array($this->imageStd)) {
                 // Invert the standard deviation values to multiply instead of divide
-                $inversedStd = array_map(fn ($std) => 1 / $std, $this->imageStd);
+                $inversedStd = array_map(fn($std) => 1 / $std, $this->imageStd);
                 $imageStd = Tensor::repeat($inversedStd, $image->height() * $image->width(), 1);
             } else {
                 $imageStd = Tensor::fill([$image->channels * $image->height() * $image->width()], 1 / $this->imageStd);
@@ -421,7 +363,7 @@ public function preprocess(
             $imageStd = $imageStd->reshape($imageTensor->shape());
 
             if (count($imageMean) !== $image->channels || count($imageStd) !== $image->channels) {
-                throw new Exception("When set to arrays, the length of `imageMean` (".count($imageMean).") and `imageStd` (".count($imageStd).") must match the number of channels in the image ({$image->channels}).");
+                throw new Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels}).");
             }
 
             // Normalize pixel data
@@ -450,15 +392,17 @@ public function preprocess(
      * preprocesses each image, and concatenates the resulting
      * features into a single Tensor.
      *
-     * @param Image|Image[] $images The image(s) to extract features from.
+     * @param Image|Image[] $input The image(s) to extract features from.
      * @param mixed ...$args Additional arguments.
      *
      * @return array An object containing the concatenated pixel values (and other metadata) of the preprocessed images.
      */
-    public function __invoke(Image|array $images, ...$args): array
+    public function __invoke($input, ...$args): array
     {
-        if (!is_array($images)) {
-            $images = [$images];
+        $images = is_array($input) ? $input : [$input];
+
+        if (count($images) === 0) {
+            throw new Exception('No images provided');
         }
 
         $imageData = array_map([$this, 'preprocess'], $images);
@@ -516,6 +460,6 @@ private function constraintToMultipleOf(int $val, int $multiple, int $minVal = 0
             $x = ceil($a) * $multiple;
         }
 
-        return $x;
+        return (int)$x;
     }
-}
+}
@@ -32,26 +32,26 @@ public function __construct(array $config)
 
     /**
      *  Extracts features from a given audio using the provided configuration.
-     * @param Tensor $waveform The audio tensor to extract features from.
+     * @param Tensor $input The audio tensor to extract features from.
      * @return Tensor[] The extracted features.
      */
-    public function __invoke(Tensor $waveform): array
+    public function __invoke($input, ...$args): array
     {
-        if ($waveform->size() > $this->config['n_samples']) {
+        if ($input->size() > $this->config['n_samples']) {
             $logger = Transformers::getLogger();
             $logger->warning('Attempting to extract features for audio longer than 30 seconds.' .
                 'If using a pipeline to extract transcript from a long audio clip,' .
                 'remember to specify `chunkLengthSecs` and/or `strideLengthSecs` in the pipeline options.');
 
-            $waveform = $waveform->sliceWithBounds([0], [$this->config['n_samples']]);
-        } else if ($waveform->size() < $this->config['n_samples']) {
-            $padLength = $this->config['n_samples'] - $waveform->size();
-            $padding = Tensor::zeros([$padLength], dtype: $waveform->dtype());
-            $waveform = Tensor::concat([$waveform, $padding]);
+            $input = $input->sliceWithBounds([0], [$this->config['n_samples']]);
+        } else if ($input->size() < $this->config['n_samples']) {
+            $padLength = $this->config['n_samples'] - $input->size();
+            $padding = Tensor::zeros([$padLength], dtype: $input->dtype());
+            $input = Tensor::concat([$input, $padding]);
         }
 
         $features = Audio::spectrogram(
-            $waveform,
+            $input,
             $this->window,
             frameLength: $this->config['n_fft'],
             hopLength: $this->config['hop_length'],
 
@@ -60,7 +60,7 @@ function encoderDecoderPrepareInputsForGeneration(PretrainedModel $model, $input
 
     public function encoderForward(PretrainedModel $model, array $modelInputs): array
     {
-        $inputNames = array_column($model->session->inputs(), 'name');
+        $inputNames = array_column($model->sessions['encoder']->inputs(), 'name');
 
         $encoderFeeds = array_pick($modelInputs, $inputNames);
 
@@ -78,7 +78,7 @@ public function encoderForward(PretrainedModel $model, array $modelInputs): arra
             $encoderFeeds['token_type_ids'] ??= Tensor::zerosLike($encoderFeeds['input_ids']);
         }
 
-        return $model->runSession($model->session, $encoderFeeds);
+        return $model->runSession($model->sessions['encoder'], $encoderFeeds);
     }
 
     function decoderPrepareInputsForGeneration(PretrainedModel $model, $inputIds, array $modelInputs): array
@@ -121,9 +121,9 @@ function decoderPrepareInputsForGeneration(PretrainedModel $model, $inputIds, ar
         return $modelInputs;
     }
 
-    protected function decoderForward(PretrainedModel $model, array $modelInputs, $isEncoderDecoder = false): array
+    protected function decoderForward(PretrainedModel $model, array $modelInputs): array
     {
-        $session = $isEncoderDecoder ? $model->decoderMergedSession : $model->session;
+        $session = $model->sessions['decoder'];
 
         $inputNames = array_column($session->inputs(), 'name');
 
@@ -152,45 +152,43 @@ protected function seq2seqForward(PretrainedModel $model, array $modelInputs): a
     {
         $decoderFeeds = $modelInputs;
         $encoderOutputs = array_pop_key($decoderFeeds, 'encoder_outputs');
-        $inputIds = array_pop_key($decoderFeeds, 'input_ids');
         $decoderInputIds = array_pop_key($decoderFeeds, 'decoder_input_ids');
 
-
-
-        // Encode if needed
         if (!$encoderOutputs) {
-            $inputNames = array_column($model->session->inputs(), 'name');
-            // Pick necessary encoder inputs
+            $inputNames = array_column($model->sessions['encoder']->inputs(), 'name');
             $encoderInputs = array_pick($modelInputs, $inputNames);
-            // Encoder outputs are not given, so we must compute them
             $encoderOutputs = $this->encoderForward($model, $encoderInputs)['last_hidden_state'];
         }
 
-        // Set decoder input ids and encoder hidden states
         $decoderFeeds['input_ids'] = $decoderInputIds;
         $decoderFeeds['encoder_hidden_states'] = $encoderOutputs;
 
-        $inputNames = array_column($model->decoderMergedSession->inputs(), 'name');
+        $inputNames = array_column($model->sessions['decoder']->inputs(), 'name');
 
         if (in_array('encoder_attention_mask', $inputNames)) {
             $decoderFeeds['encoder_attention_mask'] = $modelInputs['attention_mask'];
         }
 
-        return $this->decoderForward($model, $decoderFeeds, true);
+        return $this->decoderForward($model, $decoderFeeds);
     }
 
+    /**
+     * Create position IDs based on the attention mask.
+     * 
+     * @param array{input_ids: Tensor, inputs_embeds: Tensor, attention_mask: Tensor} $modelInputs
+     * @param array|null $pastKeyValues
+     * @return Tensor
+     */
     protected function createPositionIds(array $modelInputs, ?array $pastKeyValues = null): Tensor
     {
         $inputIds = $modelInputs['input_ids'] ?? null;
         $inputsEmbeds = $modelInputs['inputs_embeds'] ?? null;
-        /** @var Tensor $attentionMask */
         $attentionMask = $modelInputs['attention_mask'];
 
         [$batchSize, $seqLen] = $attentionMask->shape();
 
         $data = array_fill(0, $attentionMask->size(), 0);
 
-        // Compute position IDs based on the attention mask
         for ($i = 0; $i < $batchSize; ++$i) {
             $start = $i * $seqLen;
             $sum = 0;
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ public function __construct(protected string $filename) {}`
`16`	`16`
`17`	`17`	`public function log($level, $message, array $context = []): void`
`18`	`18`	`{`
`19`		`- $line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context));`
	`19`	`+ $line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context, JSON_UNESCAPED_SLASHES));`
`20`	`20`	`file_put_contents($this->filename, $line, FILE_APPEND);`
`21`	`21`	`}`
`22`	`22`	`}`
Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@ class FeatureExtractor`
`12`	`12`	`{`
`13`	`13`	`public function __construct(public array $config) {}`
`14`	`14`
`15`		`- public function __invoke(mixed $input, ...$args)`
	`15`	`+ public function __invoke($input, ...$args)`
`16`	`16`	`{`
`17`	`17`	`return $input;`
`18`	`18`	`}`