Skip to content

Commit e5e4a43

Browse files
refactor: unify model session handling and clean up feature extractors
- Use `$sessions` array in PretrainedModel instead of separate session properties
- Remove redundant constructors in model subclasses
- Minor logging and doc improvements in Audio and Image utils
- Update .gitignore for log files
1 parent 688ef6c commit e5e4a43

19 files changed

+120
-250
lines changed

examples/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
vendor
22
.transformers-cache/*
33
composer.lock
4-
paddleocr
4+
paddleocr
5+
*.log

examples/bootstrap.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public function __construct(protected string $filename) {}
1616

1717
public function log($level, $message, array $context = []): void
1818
{
19-
$line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context));
19+
$line = sprintf("[%s][%s] %s %s\n", date('Y-m-d H:i:s'), strtoupper($level), $message, empty($context) ? '' : json_encode($context, JSON_UNESCAPED_SLASHES));
2020
file_put_contents($this->filename, $line, FILE_APPEND);
2121
}
2222
}

examples/misc/general-test.php

Lines changed: 0 additions & 21 deletions
This file was deleted.

examples/transformers.log

Whitespace-only changes.

src/FeatureExtractors/ASTFeatureExtractor.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,13 @@ public function __construct(array $config)
4646

4747
/**
4848
* Extracts features from a given audio using the provided configuration.
49-
* @param Tensor $waveform The audio tensor to extract features from.
49+
* @param Tensor $input The audio tensor to extract features from.
5050
* @return Tensor[] The extracted features.
5151
*/
52-
public function __invoke(Tensor $waveform): array
52+
public function __invoke($input, ...$args): array
5353
{
5454
$features = Audio::spectrogram(
55-
$waveform,
55+
$input,
5656
$this->window,
5757
frameLength: 400,
5858
hopLength: 160,
@@ -72,4 +72,4 @@ public function __invoke(Tensor $waveform): array
7272
'input_values' => $features->add(-$this->mean)->multiply(1 / $this->std)->unsqueeze(0)
7373
];
7474
}
75-
}
75+
}

src/FeatureExtractors/DetrFeatureExtractor.php

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,12 @@ class DetrFeatureExtractor extends ImageFeatureExtractor
1616
/**
1717
* Calls the feature extraction process on an array of images, preprocesses
1818
* each image, and concatenates the resulting features into a single Tensor.
19-
* @param Image|array $images The image(s) to extract features from.
19+
* @param Image|array $input The image(s) to extract features from.
2020
* @return array An object containing the concatenated pixel values of the preprocessed images.
2121
*/
22-
public function __invoke(Image|array $images, ...$args): array
22+
public function __invoke($input, ...$args): array
2323
{
24-
$result = parent::__invoke($images, $args);
25-
24+
$result = parent::__invoke($input, $args);
2625

2726
// TODO support differently-sized images, for now assume all images are the same size.
2827
// TODO support different mask sizes (not just 64x64)
@@ -49,4 +48,4 @@ public function postProcessObjectDetection(ObjectDetectionOutput $outputs, float
4948
{
5049
return Processor::postProcessObjectDetection($outputs, $threshold, $targetSizes, $isZeroShot);
5150
}
52-
}
51+
}

src/FeatureExtractors/FeatureExtractor.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ class FeatureExtractor
1212
{
1313
public function __construct(public array $config) {}
1414

15-
public function __invoke(mixed $input, ...$args)
15+
public function __invoke($input, ...$args)
1616
{
1717
return $input;
1818
}

src/FeatureExtractors/ImageFeatureExtractor.php

Lines changed: 14 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
use Codewithkyrian\Transformers\Tensor\Tensor;
99
use Codewithkyrian\Transformers\Utils\Image;
1010
use Exception;
11-
use Imagine\Image\Point;
1211

1312
class ImageFeatureExtractor extends FeatureExtractor
1413
{
@@ -78,61 +77,6 @@ public function __construct(public array $config)
7877
}
7978
}
8079

81-
82-
/**
83-
* Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
84-
*
85-
* @param int $grayThreshold Value below which pixels are considered to be gray.
86-
*
87-
* @return static The cropped image.
88-
*/
89-
public function cropMargin(Image $image, int $grayThreshold = 200): static
90-
{
91-
$grayImage = $image->clone()->grayscale();
92-
93-
// Get the min and max pixel values
94-
$minValue = min($grayImage->toTensor()->buffer())[0];
95-
$maxValue = max($grayImage->toTensor()->buffer())[0];
96-
97-
$diff = $maxValue - $minValue;
98-
99-
// If all pixels have the same value, no need to crop
100-
if ($diff === 0) {
101-
return $this;
102-
}
103-
104-
$threshold = $grayThreshold / 255;
105-
106-
$xMin = $image->width();
107-
$yMin = $image->height();
108-
$xMax = 0;
109-
$yMax = 0;
110-
111-
$width = $image->width();
112-
$height = $image->height();
113-
114-
// Iterate over each pixel in the image
115-
for ($y = 0; $y < $height; ++$y) {
116-
for ($x = 0; $x < $width; ++$x) {
117-
$color = $grayImage->image->getColorAt(new Point($x, $y));
118-
$pixelValue = $color->getRed(); // Assuming grayscale, so red channel is sufficient
119-
120-
if (($pixelValue - $minValue) / $diff < $threshold) {
121-
// We have a non-gray pixel, so update the min/max values accordingly
122-
$xMin = min($xMin, $x);
123-
$yMin = min($yMin, $y);
124-
$xMax = max($xMax, $x);
125-
$yMax = max($yMax, $y);
126-
}
127-
}
128-
}
129-
130-
// Crop the image using the calculated bounds
131-
$image->crop($xMin, $yMin, $xMax, $yMax);
132-
133-
return $this;
134-
}
135-
13680
/**
13781
* Pad the image by a certain amount.
13882
*
@@ -152,8 +96,7 @@ public function padImage(
15296
string $mode = 'constant',
15397
bool $center = false,
15498
int $constantValues = 0
155-
): Tensor
156-
{
99+
): Tensor {
157100
if ($tensorFormat === 'CHW') {
158101
[$imageChannels, $imageHeight, $imageWidth] = $imageTensor->shape();
159102
} else {
@@ -324,7 +267,7 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
324267
} elseif ($this->sizeDivisibility != null) {
325268
return $this->enforceSizeDivisibility([$srcWidth, $srcHeight], $this->sizeDivisibility);
326269
} else {
327-
throw new Exception("Could not resize image due to unsupported 'size' parameter passed: ".json_encode($size));
270+
throw new Exception("Could not resize image due to unsupported 'size' parameter passed: " . json_encode($size));
328271
}
329272
}
330273

@@ -347,8 +290,7 @@ public function preprocess(
347290
?bool $doPad = null,
348291
?bool $doConvertRGB = null,
349292
?bool $doConvertGrayscale = null
350-
): array
351-
{
293+
): array {
352294
if ($this->doCropMargin) {
353295
// Specific to nougat processors. This is done before resizing,
354296
// and can be interpreted as a pre-preprocessing step.
@@ -400,7 +342,7 @@ public function preprocess(
400342
if ($doNormalize ?? $this->doNormalize) {
401343
if (is_array($this->imageMean)) {
402344
// Negate the mean values to add instead of subtract
403-
$negatedMean = array_map(fn ($mean) => -$mean, $this->imageMean);
345+
$negatedMean = array_map(fn($mean) => -$mean, $this->imageMean);
404346
$imageMean = Tensor::repeat($negatedMean, $image->height() * $image->width(), 1);
405347
} else {
406348
$imageMean = Tensor::fill([$image->channels * $image->height() * $image->width()], -$this->imageMean);
@@ -409,7 +351,7 @@ public function preprocess(
409351

410352
if (is_array($this->imageStd)) {
411353
// Invert the standard deviation values to multiply instead of divide
412-
$inversedStd = array_map(fn ($std) => 1 / $std, $this->imageStd);
354+
$inversedStd = array_map(fn($std) => 1 / $std, $this->imageStd);
413355
$imageStd = Tensor::repeat($inversedStd, $image->height() * $image->width(), 1);
414356
} else {
415357
$imageStd = Tensor::fill([$image->channels * $image->height() * $image->width()], 1 / $this->imageStd);
@@ -421,7 +363,7 @@ public function preprocess(
421363
$imageStd = $imageStd->reshape($imageTensor->shape());
422364

423365
if (count($imageMean) !== $image->channels || count($imageStd) !== $image->channels) {
424-
throw new Exception("When set to arrays, the length of `imageMean` (".count($imageMean).") and `imageStd` (".count($imageStd).") must match the number of channels in the image ({$image->channels}).");
366+
throw new Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels}).");
425367
}
426368

427369
// Normalize pixel data
@@ -450,15 +392,17 @@ public function preprocess(
450392
* preprocesses each image, and concatenates the resulting
451393
* features into a single Tensor.
452394
*
453-
* @param Image|Image[] $images The image(s) to extract features from.
395+
* @param Image|Image[] $input The image(s) to extract features from.
454396
* @param mixed ...$args Additional arguments.
455397
*
456398
* @return array An object containing the concatenated pixel values (and other metadata) of the preprocessed images.
457399
*/
458-
public function __invoke(Image|array $images, ...$args): array
400+
public function __invoke($input, ...$args): array
459401
{
460-
if (!is_array($images)) {
461-
$images = [$images];
402+
$images = is_array($input) ? $input : [$input];
403+
404+
if (count($images) === 0) {
405+
throw new Exception('No images provided');
462406
}
463407

464408
$imageData = array_map([$this, 'preprocess'], $images);
@@ -516,6 +460,6 @@ private function constraintToMultipleOf(int $val, int $multiple, int $minVal = 0
516460
$x = ceil($a) * $multiple;
517461
}
518462

519-
return $x;
463+
return (int)$x;
520464
}
521-
}
465+
}

src/FeatureExtractors/WhisperFeatureExtractor.php

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -32,26 +32,26 @@ public function __construct(array $config)
3232

3333
/**
3434
* Extracts features from a given audio using the provided configuration.
35-
* @param Tensor $waveform The audio tensor to extract features from.
35+
* @param Tensor $input The audio tensor to extract features from.
3636
* @return Tensor[] The extracted features.
3737
*/
38-
public function __invoke(Tensor $waveform): array
38+
public function __invoke($input, ...$args): array
3939
{
40-
if ($waveform->size() > $this->config['n_samples']) {
40+
if ($input->size() > $this->config['n_samples']) {
4141
$logger = Transformers::getLogger();
4242
$logger->warning('Attempting to extract features for audio longer than 30 seconds.' .
4343
'If using a pipeline to extract transcript from a long audio clip,' .
4444
'remember to specify `chunkLengthSecs` and/or `strideLengthSecs` in the pipeline options.');
4545

46-
$waveform = $waveform->sliceWithBounds([0], [$this->config['n_samples']]);
47-
} else if ($waveform->size() < $this->config['n_samples']) {
48-
$padLength = $this->config['n_samples'] - $waveform->size();
49-
$padding = Tensor::zeros([$padLength], dtype: $waveform->dtype());
50-
$waveform = Tensor::concat([$waveform, $padding]);
46+
$input = $input->sliceWithBounds([0], [$this->config['n_samples']]);
47+
} else if ($input->size() < $this->config['n_samples']) {
48+
$padLength = $this->config['n_samples'] - $input->size();
49+
$padding = Tensor::zeros([$padLength], dtype: $input->dtype());
50+
$input = Tensor::concat([$input, $padding]);
5151
}
5252

5353
$features = Audio::spectrogram(
54-
$waveform,
54+
$input,
5555
$this->window,
5656
frameLength: $this->config['n_fft'],
5757
hopLength: $this->config['hop_length'],

src/Models/ModelArchitecture.php

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ function encoderDecoderPrepareInputsForGeneration(PretrainedModel $model, $input
6060

6161
public function encoderForward(PretrainedModel $model, array $modelInputs): array
6262
{
63-
$inputNames = array_column($model->session->inputs(), 'name');
63+
$inputNames = array_column($model->sessions['encoder']->inputs(), 'name');
6464

6565
$encoderFeeds = array_pick($modelInputs, $inputNames);
6666

@@ -78,7 +78,7 @@ public function encoderForward(PretrainedModel $model, array $modelInputs): arra
7878
$encoderFeeds['token_type_ids'] ??= Tensor::zerosLike($encoderFeeds['input_ids']);
7979
}
8080

81-
return $model->runSession($model->session, $encoderFeeds);
81+
return $model->runSession($model->sessions['encoder'], $encoderFeeds);
8282
}
8383

8484
function decoderPrepareInputsForGeneration(PretrainedModel $model, $inputIds, array $modelInputs): array
@@ -121,9 +121,9 @@ function decoderPrepareInputsForGeneration(PretrainedModel $model, $inputIds, ar
121121
return $modelInputs;
122122
}
123123

124-
protected function decoderForward(PretrainedModel $model, array $modelInputs, $isEncoderDecoder = false): array
124+
protected function decoderForward(PretrainedModel $model, array $modelInputs): array
125125
{
126-
$session = $isEncoderDecoder ? $model->decoderMergedSession : $model->session;
126+
$session = $model->sessions['decoder'];
127127

128128
$inputNames = array_column($session->inputs(), 'name');
129129

@@ -152,45 +152,43 @@ protected function seq2seqForward(PretrainedModel $model, array $modelInputs): a
152152
{
153153
$decoderFeeds = $modelInputs;
154154
$encoderOutputs = array_pop_key($decoderFeeds, 'encoder_outputs');
155-
$inputIds = array_pop_key($decoderFeeds, 'input_ids');
156155
$decoderInputIds = array_pop_key($decoderFeeds, 'decoder_input_ids');
157156

158-
159-
160-
// Encode if needed
161157
if (!$encoderOutputs) {
162-
$inputNames = array_column($model->session->inputs(), 'name');
163-
// Pick necessary encoder inputs
158+
$inputNames = array_column($model->sessions['encoder']->inputs(), 'name');
164159
$encoderInputs = array_pick($modelInputs, $inputNames);
165-
// Encoder outputs are not given, so we must compute them
166160
$encoderOutputs = $this->encoderForward($model, $encoderInputs)['last_hidden_state'];
167161
}
168162

169-
// Set decoder input ids and encoder hidden states
170163
$decoderFeeds['input_ids'] = $decoderInputIds;
171164
$decoderFeeds['encoder_hidden_states'] = $encoderOutputs;
172165

173-
$inputNames = array_column($model->decoderMergedSession->inputs(), 'name');
166+
$inputNames = array_column($model->sessions['decoder']->inputs(), 'name');
174167

175168
if (in_array('encoder_attention_mask', $inputNames)) {
176169
$decoderFeeds['encoder_attention_mask'] = $modelInputs['attention_mask'];
177170
}
178171

179-
return $this->decoderForward($model, $decoderFeeds, true);
172+
return $this->decoderForward($model, $decoderFeeds);
180173
}
181174

175+
/**
176+
* Create position IDs based on the attention mask.
177+
*
178+
* @param array{input_ids: Tensor, inputs_embeds: Tensor, attention_mask: Tensor} $modelInputs
179+
* @param array|null $pastKeyValues
180+
* @return Tensor
181+
*/
182182
protected function createPositionIds(array $modelInputs, ?array $pastKeyValues = null): Tensor
183183
{
184184
$inputIds = $modelInputs['input_ids'] ?? null;
185185
$inputsEmbeds = $modelInputs['inputs_embeds'] ?? null;
186-
/** @var Tensor $attentionMask */
187186
$attentionMask = $modelInputs['attention_mask'];
188187

189188
[$batchSize, $seqLen] = $attentionMask->shape();
190189

191190
$data = array_fill(0, $attentionMask->size(), 0);
192191

193-
// Compute position IDs based on the attention mask
194192
for ($i = 0; $i < $batchSize; ++$i) {
195193
$start = $i * $seqLen;
196194
$sum = 0;

0 commit comments

Comments
 (0)