Skip to content

Commit dbee20e

Browse files
committed
cleanup
1 parent ecf39c5 commit dbee20e

File tree

6 files changed: 232 additions and 26 deletions.

apps/docs/content/docs/en/tools/textract.mdx

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ In Sim, the AWS Textract integration empowers your agents to intelligently proce
2727

2828
## Usage Instructions
2929

30-
Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. Async mode supports multi-page PDF and TIFF via S3.
30+
Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Single-page mode supports JPEG, PNG, and single-page PDF. Multi-page mode supports multi-page PDF and TIFF.
3131

3232

3333

@@ -45,8 +45,8 @@ Parse documents using AWS Textract OCR and document analysis
4545
| `secretAccessKey` | string | Yes | AWS Secret Access Key |
4646
| `region` | string | Yes | AWS region for Textract service \(e.g., us-east-1\) |
4747
| `processingMode` | string | No | Document type: single-page or multi-page. Defaults to single-page. |
48-
| `filePath` | string | No | URL to a document to be processed \(JPEG, PNG, PDF, or TIFF\). Required for sync mode. |
49-
| `s3Uri` | string | No | S3 URI for async processing \(s3://bucket/key\). Required for async mode with S3 input. |
48+
| `filePath` | string | No | URL to a document to be processed \(JPEG, PNG, or single-page PDF\). |
49+
| `s3Uri` | string | No | S3 URI for multi-page processing \(s3://bucket/key\). |
5050
| `fileUpload` | object | No | File upload data from file-upload component |
5151
| `featureTypes` | array | No | Feature types to detect: TABLES, FORMS, QUERIES, SIGNATURES, LAYOUT. If not specified, only text detection is performed. |
5252
| `items` | string | No | Feature type |

apps/sim/app/api/tools/textract/parse/route.ts

Lines changed: 3 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,9 +49,6 @@ const TextractParseSchema = z
4949
}
5050
})
5151

52-
/**
53-
* Generate AWS Signature Version 4 signing key
54-
*/
5552
function getSignatureKey(
5653
key: string,
5754
dateStamp: string,
@@ -309,7 +306,7 @@ export async function POST(request: NextRequest) {
309306
return NextResponse.json(
310307
{
311308
success: false,
312-
error: 'S3 URI or file path is required for async processing',
309+
error: 'S3 URI or file path is required for multi-page processing',
313310
},
314311
{ status: 400 }
315312
)
@@ -338,7 +335,7 @@ export async function POST(request: NextRequest) {
338335
return NextResponse.json(
339336
{
340337
success: false,
341-
error: 'Async mode requires an S3 URI (s3://bucket/key) or an uploaded file',
338+
error: 'Multi-page mode requires an S3 URI (s3://bucket/key) or an uploaded file',
342339
},
343340
{ status: 400 }
344341
)
@@ -425,7 +422,7 @@ export async function POST(request: NextRequest) {
425422
return NextResponse.json(
426423
{
427424
success: false,
428-
error: 'File path is required for sync processing',
425+
error: 'File path is required for single-page processing',
429426
},
430427
{ status: 400 }
431428
)

apps/sim/blocks/blocks/textract.ts

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ export const TextractBlock: BlockConfig<TextractParserOutput> = {
77
name: 'AWS Textract',
88
description: 'Extract text, tables, and forms from documents',
99
authMode: AuthMode.ApiKey,
10-
longDescription: `Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Sync mode supports JPEG, PNG, and single-page PDF. Async mode supports multi-page PDF and TIFF via S3.`,
10+
longDescription: `Integrate AWS Textract into your workflow to extract text, tables, forms, and key-value pairs from documents. Single-page mode supports JPEG, PNG, and single-page PDF. Multi-page mode supports multi-page PDF and TIFF.`,
1111
docsLink: 'https://docs.sim.ai/tools/textract',
1212
category: 'tools',
1313
bgColor: 'linear-gradient(135deg, #055F4E 0%, #56C0A7 100%)',
@@ -180,7 +180,7 @@ export const TextractBlock: BlockConfig<TextractParserOutput> = {
180180
const asyncInputMethod = params.asyncInputMethod || 's3'
181181
if (asyncInputMethod === 's3') {
182182
if (!params.s3Uri || params.s3Uri.trim() === '') {
183-
throw new Error('S3 URI is required for async processing')
183+
throw new Error('S3 URI is required for multi-page processing')
184184
}
185185
parameters.s3Uri = params.s3Uri.trim()
186186
} else if (asyncInputMethod === 'upload') {
@@ -220,12 +220,12 @@ export const TextractBlock: BlockConfig<TextractParserOutput> = {
220220
},
221221
inputs: {
222222
processingMode: { type: 'string', description: 'Document type: single-page or multi-page' },
223-
inputMethod: { type: 'string', description: 'Input method selection for sync mode' },
224-
asyncInputMethod: { type: 'string', description: 'Input method selection for async mode' },
223+
inputMethod: { type: 'string', description: 'Input method selection for single-page mode' },
224+
asyncInputMethod: { type: 'string', description: 'Input method selection for multi-page mode' },
225225
filePath: { type: 'string', description: 'Document URL' },
226-
s3Uri: { type: 'string', description: 'S3 URI for async processing (s3://bucket/key)' },
227-
fileUpload: { type: 'json', description: 'Uploaded document file for sync mode' },
228-
asyncFileUpload: { type: 'json', description: 'Uploaded document file for async mode' },
226+
s3Uri: { type: 'string', description: 'S3 URI for multi-page processing (s3://bucket/key)' },
227+
fileUpload: { type: 'json', description: 'Uploaded document file for single-page mode' },
228+
asyncFileUpload: { type: 'json', description: 'Uploaded document file for multi-page mode' },
229229
extractTables: { type: 'boolean', description: 'Extract tables from document' },
230230
extractForms: { type: 'boolean', description: 'Extract form key-value pairs' },
231231
detectSignatures: { type: 'boolean', description: 'Detect signatures' },

apps/sim/lib/core/security/input-validation.test.ts

Lines changed: 215 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ import {
44
createPinnedUrl,
55
validateAirtableId,
66
validateAlphanumericId,
7+
validateAwsRegion,
78
validateEnum,
89
validateExternalUrl,
910
validateFileExtension,
@@ -17,6 +18,7 @@ import {
1718
validateNumericId,
1819
validatePathSegment,
1920
validateProxyUrl,
21+
validateS3BucketName,
2022
validateUrlWithDNS,
2123
} from '@/lib/core/security/input-validation'
2224
import { sanitizeForLogging } from '@/lib/core/security/redaction'
@@ -1192,3 +1194,216 @@ describe('validateAirtableId', () => {
11921194
})
11931195
})
11941196
})
1197+
1198+
describe('validateAwsRegion', () => {
1199+
describe('valid regions', () => {
1200+
it.concurrent('should accept us-east-1', () => {
1201+
const result = validateAwsRegion('us-east-1')
1202+
expect(result.isValid).toBe(true)
1203+
expect(result.sanitized).toBe('us-east-1')
1204+
})
1205+
1206+
it.concurrent('should accept us-west-2', () => {
1207+
const result = validateAwsRegion('us-west-2')
1208+
expect(result.isValid).toBe(true)
1209+
})
1210+
1211+
it.concurrent('should accept eu-west-1', () => {
1212+
const result = validateAwsRegion('eu-west-1')
1213+
expect(result.isValid).toBe(true)
1214+
})
1215+
1216+
it.concurrent('should accept ap-southeast-1', () => {
1217+
const result = validateAwsRegion('ap-southeast-1')
1218+
expect(result.isValid).toBe(true)
1219+
})
1220+
1221+
it.concurrent('should accept sa-east-1', () => {
1222+
const result = validateAwsRegion('sa-east-1')
1223+
expect(result.isValid).toBe(true)
1224+
})
1225+
1226+
it.concurrent('should accept me-south-1', () => {
1227+
const result = validateAwsRegion('me-south-1')
1228+
expect(result.isValid).toBe(true)
1229+
})
1230+
1231+
it.concurrent('should accept af-south-1', () => {
1232+
const result = validateAwsRegion('af-south-1')
1233+
expect(result.isValid).toBe(true)
1234+
})
1235+
1236+
it.concurrent('should accept regions with double-digit numbers', () => {
1237+
const result = validateAwsRegion('ap-northeast-12')
1238+
expect(result.isValid).toBe(true)
1239+
})
1240+
})
1241+
1242+
describe('invalid regions', () => {
1243+
it.concurrent('should reject null', () => {
1244+
const result = validateAwsRegion(null)
1245+
expect(result.isValid).toBe(false)
1246+
expect(result.error).toContain('required')
1247+
})
1248+
1249+
it.concurrent('should reject empty string', () => {
1250+
const result = validateAwsRegion('')
1251+
expect(result.isValid).toBe(false)
1252+
expect(result.error).toContain('required')
1253+
})
1254+
1255+
it.concurrent('should reject uppercase regions', () => {
1256+
const result = validateAwsRegion('US-EAST-1')
1257+
expect(result.isValid).toBe(false)
1258+
})
1259+
1260+
it.concurrent('should reject invalid format - missing number', () => {
1261+
const result = validateAwsRegion('us-east')
1262+
expect(result.isValid).toBe(false)
1263+
})
1264+
1265+
it.concurrent('should reject invalid format - wrong separators', () => {
1266+
const result = validateAwsRegion('us_east_1')
1267+
expect(result.isValid).toBe(false)
1268+
})
1269+
1270+
it.concurrent('should reject invalid format - too many parts', () => {
1271+
const result = validateAwsRegion('us-east-1-extra')
1272+
expect(result.isValid).toBe(false)
1273+
})
1274+
1275+
it.concurrent('should reject path traversal attempts', () => {
1276+
const result = validateAwsRegion('../etc/passwd')
1277+
expect(result.isValid).toBe(false)
1278+
})
1279+
1280+
it.concurrent('should reject arbitrary strings', () => {
1281+
const result = validateAwsRegion('not-a-region')
1282+
expect(result.isValid).toBe(false)
1283+
})
1284+
1285+
it.concurrent('should use custom param name in errors', () => {
1286+
const result = validateAwsRegion('', 'awsRegion')
1287+
expect(result.error).toContain('awsRegion')
1288+
})
1289+
})
1290+
})
1291+
1292+
describe('validateS3BucketName', () => {
1293+
describe('valid bucket names', () => {
1294+
it.concurrent('should accept simple bucket name', () => {
1295+
const result = validateS3BucketName('my-bucket')
1296+
expect(result.isValid).toBe(true)
1297+
expect(result.sanitized).toBe('my-bucket')
1298+
})
1299+
1300+
it.concurrent('should accept bucket name with numbers', () => {
1301+
const result = validateS3BucketName('bucket123')
1302+
expect(result.isValid).toBe(true)
1303+
})
1304+
1305+
it.concurrent('should accept bucket name with periods', () => {
1306+
const result = validateS3BucketName('my.bucket.name')
1307+
expect(result.isValid).toBe(true)
1308+
})
1309+
1310+
it.concurrent('should accept 3 character bucket name', () => {
1311+
const result = validateS3BucketName('abc')
1312+
expect(result.isValid).toBe(true)
1313+
})
1314+
1315+
it.concurrent('should accept 63 character bucket name', () => {
1316+
const result = validateS3BucketName('a'.repeat(63))
1317+
expect(result.isValid).toBe(true)
1318+
})
1319+
1320+
it.concurrent('should accept minimum valid bucket name (3 chars)', () => {
1321+
const result = validateS3BucketName('a1b')
1322+
expect(result.isValid).toBe(true)
1323+
})
1324+
})
1325+
1326+
describe('invalid bucket names - null/empty', () => {
1327+
it.concurrent('should reject null', () => {
1328+
const result = validateS3BucketName(null)
1329+
expect(result.isValid).toBe(false)
1330+
expect(result.error).toContain('required')
1331+
})
1332+
1333+
it.concurrent('should reject empty string', () => {
1334+
const result = validateS3BucketName('')
1335+
expect(result.isValid).toBe(false)
1336+
expect(result.error).toContain('required')
1337+
})
1338+
})
1339+
1340+
describe('invalid bucket names - length', () => {
1341+
it.concurrent('should reject 2 character bucket name', () => {
1342+
const result = validateS3BucketName('ab')
1343+
expect(result.isValid).toBe(false)
1344+
expect(result.error).toContain('between 3 and 63')
1345+
})
1346+
1347+
it.concurrent('should reject 64 character bucket name', () => {
1348+
const result = validateS3BucketName('a'.repeat(64))
1349+
expect(result.isValid).toBe(false)
1350+
expect(result.error).toContain('between 3 and 63')
1351+
})
1352+
})
1353+
1354+
describe('invalid bucket names - format', () => {
1355+
it.concurrent('should reject uppercase letters', () => {
1356+
const result = validateS3BucketName('MyBucket')
1357+
expect(result.isValid).toBe(false)
1358+
})
1359+
1360+
it.concurrent('should reject underscores', () => {
1361+
const result = validateS3BucketName('my_bucket')
1362+
expect(result.isValid).toBe(false)
1363+
})
1364+
1365+
it.concurrent('should reject starting with hyphen', () => {
1366+
const result = validateS3BucketName('-mybucket')
1367+
expect(result.isValid).toBe(false)
1368+
})
1369+
1370+
it.concurrent('should reject ending with hyphen', () => {
1371+
const result = validateS3BucketName('mybucket-')
1372+
expect(result.isValid).toBe(false)
1373+
})
1374+
1375+
it.concurrent('should reject starting with period', () => {
1376+
const result = validateS3BucketName('.mybucket')
1377+
expect(result.isValid).toBe(false)
1378+
})
1379+
1380+
it.concurrent('should reject ending with period', () => {
1381+
const result = validateS3BucketName('mybucket.')
1382+
expect(result.isValid).toBe(false)
1383+
})
1384+
1385+
it.concurrent('should reject consecutive periods', () => {
1386+
const result = validateS3BucketName('my..bucket')
1387+
expect(result.isValid).toBe(false)
1388+
expect(result.error).toContain('consecutive periods')
1389+
})
1390+
1391+
it.concurrent('should reject IP address format', () => {
1392+
const result = validateS3BucketName('192.168.1.1')
1393+
expect(result.isValid).toBe(false)
1394+
expect(result.error).toContain('IP address')
1395+
})
1396+
1397+
it.concurrent('should reject special characters', () => {
1398+
const result = validateS3BucketName('my@bucket')
1399+
expect(result.isValid).toBe(false)
1400+
})
1401+
})
1402+
1403+
describe('error messages', () => {
1404+
it.concurrent('should use custom param name in errors', () => {
1405+
const result = validateS3BucketName('', 's3Bucket')
1406+
expect(result.error).toContain('s3Bucket')
1407+
})
1408+
})
1409+
})

apps/sim/tools/textract/parser.ts

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -40,15 +40,13 @@ export const textractParserTool: ToolConfig<TextractParserInput, TextractParserO
4040
type: 'string',
4141
required: false,
4242
visibility: 'user-only',
43-
description:
44-
'URL to a document to be processed (JPEG, PNG, PDF, or TIFF). Required for sync mode.',
43+
description: 'URL to a document to be processed (JPEG, PNG, or single-page PDF).',
4544
},
4645
s3Uri: {
4746
type: 'string',
4847
required: false,
4948
visibility: 'user-only',
50-
description:
51-
'S3 URI for async processing (s3://bucket/key). Required for async mode with S3 input.',
49+
description: 'S3 URI for multi-page processing (s3://bucket/key).',
5250
},
5351
fileUpload: {
5452
type: 'object',
@@ -144,13 +142,13 @@ export const textractParserTool: ToolConfig<TextractParserInput, TextractParserO
144142
if (uploadedFilePath.startsWith('/api/files/serve/')) {
145143
requestBody.filePath = uploadedFilePath
146144
} else {
147-
throw new Error('Async mode with upload requires files stored in S3')
145+
throw new Error('Multi-page mode with upload requires files stored in S3')
148146
}
149147
} else {
150148
throw new Error('Invalid file upload: Upload data is missing or invalid')
151149
}
152150
} else {
153-
throw new Error('Async mode requires either an S3 URI or an uploaded file')
151+
throw new Error('Multi-page mode requires either an S3 URI or an uploaded file')
154152
}
155153
} else {
156154
if (

apps/sim/tools/textract/types.ts

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -70,17 +70,14 @@ export interface TextractBlock {
7070
}
7171
}
7272

73-
/** AWS Textract DocumentMetadata - exact API format */
7473
export interface TextractDocumentMetadataRaw {
7574
Pages: number
7675
}
7776

78-
/** Normalized DocumentMetadata (camelCase) */
7977
export interface TextractDocumentMetadata {
8078
pages: number
8179
}
8280

83-
/** AWS Textract API Response - exact API format */
8481
export interface TextractApiResponse {
8582
Blocks: TextractBlock[]
8683
DocumentMetadata: TextractDocumentMetadataRaw
@@ -94,7 +91,6 @@ export interface TextractNormalizedOutput {
9491
modelVersion?: string
9592
}
9693

97-
/** Async job status response from Textract */
9894
export interface TextractAsyncJobResponse {
9995
JobStatus: 'IN_PROGRESS' | 'SUCCEEDED' | 'FAILED' | 'PARTIAL_SUCCESS'
10096
StatusMessage?: string

0 commit comments

Comments
 (0)