From 4189b0a691589a84634b5031379dfd3151841a4b Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Sat, 31 Jan 2026 05:24:30 +0500 Subject: [PATCH 1/3] Fix DataFilter false positives for UUIDs and MongoDB ObjectIds (#518) * Initial plan * Fix DataFilter to not filter UUIDs and MongoDB ObjectIds Co-authored-by: neSpecc <3684889+neSpecc@users.noreply.github.com> * Address code review feedback: improve UUID regex and test coverage Co-authored-by: neSpecc <3684889+neSpecc@users.noreply.github.com> * Fix tests to use values that would actually fail without UUID/ObjectId detection Co-authored-by: neSpecc <3684889+neSpecc@users.noreply.github.com> * upd version --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: neSpecc <3684889+neSpecc@users.noreply.github.com> Co-authored-by: Peter Savchenko --- package.json | 2 +- workers/grouper/package.json | 2 +- workers/grouper/src/data-filter.ts | 26 +++++ workers/grouper/tests/data-filter.test.ts | 119 ++++++++++++++++++++++ 4 files changed, 147 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 506d732d..a053c5b8 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "hawk.workers", "private": true, - "version": "0.1.1", + "version": "0.1.2", "description": "Hawk workers", "repository": "git@github.com:codex-team/hawk.workers.git", "license": "BUSL-1.1", diff --git a/workers/grouper/package.json b/workers/grouper/package.json index 6e98edc1..294322a5 100644 --- a/workers/grouper/package.json +++ b/workers/grouper/package.json @@ -1,6 +1,6 @@ { "name": "hawk-worker-grouper", - "version": "0.0.1", + "version": "0.0.2", "description": "Accepts processed errors from language-workers and saves it to the DB with grouping of similar ones. ", "main": "src/index.ts", "repository": "https://github.com/codex-team/hawk.workers/tree/master/workers/grouper", diff --git a/workers/grouper/src/data-filter.ts b/workers/grouper/src/data-filter.ts index 40a4acf9..3571a1c6 100644 --- a/workers/grouper/src/data-filter.ts +++ b/workers/grouper/src/data-filter.ts @@ -54,6 +54,16 @@ export default class DataFilter { */ private bankCardRegex = /^(?:4[0-9]{12}(?:[0-9]{3})?|[25][1-7][0-9]{14}|6(?:011|5[0-9][0-9])[0-9]{12}|3[47][0-9]{13}|3(?:0[0-5]|[68][0-9])[0-9]{11}|(?:2131|1800|35\d{3})\d{11})$/g; + /** + * MongoDB ObjectId Regex (24 hexadecimal characters) + */ + private objectIdRegex = /^[0-9a-fA-F]{24}$/; + + /** + * UUID Regex - matches UUIDs with all dashes (8-4-4-4-12 format) or no dashes (32 hex chars) + */ + private uuidRegex = /^(?:[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}|[0-9a-fA-F]{32})$/; + /** * Accept event and process 'addons' and 'context' fields. * It mutates the original object @@ -96,6 +106,22 @@ export default class DataFilter { return value; } + /** + * Check if value matches MongoDB ObjectId pattern (24 hex chars) + * ObjectIds should not be filtered + */ + if (this.objectIdRegex.test(value)) { + return value; + } + + /** + * Check if value matches UUID pattern (with or without dashes) + * UUIDs should not be filtered + */ + if (this.uuidRegex.test(value)) { + return value; + } + /** * Remove all non-digit chars */ diff --git a/workers/grouper/tests/data-filter.test.ts b/workers/grouper/tests/data-filter.test.ts index d0a4c3af..28ff2979 100644 --- a/workers/grouper/tests/data-filter.test.ts +++ b/workers/grouper/tests/data-filter.test.ts @@ -143,5 +143,124 @@ describe('GrouperWorker', () => { expect(event.context['normalKey']).toBe(normalValue); expect(event.addons['vue']['props']['normalKey']).toBe(normalValue); }); + + test('should not filter UUID values that contain exactly 16 digits', async () => { + // These UUIDs contain exactly 16 digits, which when cleaned match PAN patterns + // Without UUID detection, they would be incorrectly filtered as credit cards + const uuidWithManyDigits = '4a1b2c3d-4e5f-6a7b-8c9d-0e1f2a3b4c5d'; // Cleans to 16 digits starting with 4 + const uuidUpperCase = '5A1B2C3D-4E5F-6A7B-8C9D-0E1F2A3B4C5D'; // Cleans to 16 digits starting with 5 + const uuidNoDashes = '2a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d'; // 32 hex chars without dashes + + const event = generateEvent({ + context: { + userId: uuidWithManyDigits, + sessionId: uuidUpperCase, + transactionId: uuidNoDashes, + }, + addons: { + vue: { + props: { + componentId: uuidWithManyDigits, + }, + }, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['userId']).toBe(uuidWithManyDigits); + expect(event.context['sessionId']).toBe(uuidUpperCase); + expect(event.context['transactionId']).toBe(uuidNoDashes); + expect(event.addons['vue']['props']['componentId']).toBe(uuidWithManyDigits); + }); + + test('should not filter MongoDB ObjectId values that contain exactly 16 digits', async () => { + // These ObjectIds contain exactly 16 digits which when cleaned match PAN patterns + // Without ObjectId detection, they would be incorrectly filtered as credit cards + const objectIdWithManyDigits = '4111111111111111abcdefab'; // 16 digits + 8 hex letters = 24 chars, cleans to Visa pattern + const objectIdUpperCase = '5111111111111111ABCDEFAB'; // Cleans to Mastercard pattern + const objectIdMixedCase = '2111111111111111AbCdEfAb'; // Cleans to Maestro/Mastercard pattern + + const event = generateEvent({ + context: { + projectId: objectIdWithManyDigits, + workspaceId: objectIdUpperCase, + transactionId: objectIdMixedCase, + }, + addons: { + hawk: { + projectId: objectIdWithManyDigits, + }, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['projectId']).toBe(objectIdWithManyDigits); + expect(event.context['workspaceId']).toBe(objectIdUpperCase); + expect(event.context['transactionId']).toBe(objectIdMixedCase); + expect(event.addons['hawk']['projectId']).toBe(objectIdWithManyDigits); + }); + + test('should still filter actual PAN numbers with formatting characters', async () => { + // Test real Mastercard test number with spaces and dashes + const panWithSpaces = '5500 0000 0000 0004'; + const panWithDashes = '5500-0000-0000-0004'; + + const event = generateEvent({ + context: { + cardNumber: panWithSpaces, + paymentCard: panWithDashes, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['cardNumber']).toBe('[filtered]'); + expect(event.context['paymentCard']).toBe('[filtered]'); + }); + + test('should not filter values that are not UUIDs, ObjectIds, or PANs', async () => { + // These are edge cases that should NOT be filtered + const shortHex = '507f1f77bcf86cd7'; // 16 hex chars (not 24) + const longNumber = '67280841958304100309082499'; // 26 digits (too long for PAN) + const mixedAlphaNum = 'abc123def456ghi789'; // Mixed content + + const event = generateEvent({ + context: { + shortId: shortHex, + longId: longNumber, + mixedId: mixedAlphaNum, + }, + }); + + dataFilter.processEvent(event); + + expect(event.context['shortId']).toBe(shortHex); + expect(event.context['longId']).toBe(longNumber); + expect(event.context['mixedId']).toBe(mixedAlphaNum); + }); + + test('should filter UUIDs and ObjectIds when they are in sensitive key fields', async () => { + // Even if the value is a valid UUID or ObjectId, it should be filtered + // if the key name is in the sensitive keys list + const uuid = '550e8400-e29b-41d4-a716-446655440000'; + const objectId = '507f1f77bcf86cd799439011'; + + const event = generateEvent({ + context: { + password: uuid, + secret: objectId, + auth: '672808419583041003090824', + }, + }); + + dataFilter.processEvent(event); + + // All should be filtered because of sensitive key names + expect(event.context['password']).toBe('[filtered]'); + expect(event.context['secret']).toBe('[filtered]'); + expect(event.context['auth']).toBe('[filtered]'); + }); }); }); From c4cc417ab4fb9a148c54789920ee304267c39f42 Mon Sep 17 00:00:00 2001 From: Dobrunia Kostrigin <48620984+Dobrunia@users.noreply.github.com> Date: Mon, 2 Feb 2026 20:31:22 +0300 Subject: [PATCH 2/3] fix(grouper): filter oldPassword and newPassword in event payload (#516) * fix(grouper): filter oldPassword and newPassword in event payload * chore: add more keys and update tests * chore: lint fix * fix(tests): rename sessionId to requestId in data-filter tests for clarity --- workers/grouper/src/data-filter.ts | 70 +++++++++++++++++++--- workers/grouper/tests/data-filter.test.ts | 73 +++++++++++++++++++++-- 2 files changed, 132 insertions(+), 11 deletions(-) diff --git a/workers/grouper/src/data-filter.ts b/workers/grouper/src/data-filter.ts index 3571a1c6..7e00038c 100644 --- a/workers/grouper/src/data-filter.ts +++ b/workers/grouper/src/data-filter.ts @@ -36,17 +36,71 @@ export default class DataFilter { private filteredValuePlaceholder = '[filtered]'; /** - * Possibly sensitive keys + * Possibly sensitive keys (lowercase; keys are compared via key.toLowerCase()) */ private possiblySensitiveDataKeys = new Set([ - 'pan', - 'secret', - 'credentials', - 'card[number]', - 'password', + /** + * Authorization and sessions + */ 'auth', + 'authorization', 'access_token', 'accesstoken', + 'token', + 'jwt', + 'session', + 'sessionid', + 'session_id', + /** + * API keys and secure tokens + */ + 'api_key', + 'apikey', + 'x-api-key', + 'x-auth-token', + 'bearer', + 'client_secret', + 'secret', + 'credentials', + /** + * Passwords + */ + 'password', + 'passwd', + 'mysql_pwd', + 'oldpassword', + 'old-password', + 'old_password', + 'newpassword', + 'new-password', + 'new_password', + /** + * Encryption keys + */ + 'private_key', + 'ssh_key', + /** + * Payments data + */ + 'card', + 'cardnumber', + 'card[number]', + 'creditcard', + 'credit_card', + 'pan', + 'pin', + 'security_code', + 'stripetoken', + 'cloudpayments_public_id', + 'cloudpayments_secret', + /** + * Config and connections + */ + 'dsn', + /** + * Personal data + */ + 'ssn', ]); /** @@ -127,7 +181,9 @@ export default class DataFilter { */ const clean = value.replace(/\D/g, ''); - // Reset last index to 0 + /** + * Reset last index to 0 + */ this.bankCardRegex.lastIndex = 0; if (!this.bankCardRegex.test(clean)) { return value; diff --git a/workers/grouper/tests/data-filter.test.ts b/workers/grouper/tests/data-filter.test.ts index 28ff2979..4cb98807 100644 --- a/workers/grouper/tests/data-filter.test.ts +++ b/workers/grouper/tests/data-filter.test.ts @@ -28,20 +28,57 @@ function generateEvent({ context, addons }: {context?: Json, addons?: EventAddon } /** - * Example of object with sensitive information + * Example of object with sensitive information. + * Keys intentionally use snake_case/kebab-case to match data-filter list. */ +/* eslint-disable @typescript-eslint/naming-convention */ const sensitiveDataMock = { pan: '5500 0000 0000 0004', secret: 'D6A03F5C2E0E356F262D56F44370E1CD813583B2', credentials: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', 'card[number]': '5500 0000 0000 0004', password: 'bFb7PBm6nZ7RJRq9', + oldpassword: 'oldSecret123', + newpassword: 'newSecret456', + 'old-password': 'oldSecretHyphen', + old_password: 'oldSecretUnderscore', + 'new-password': 'newSecretHyphen', + new_password: 'newSecretUnderscore', auth: 'C4CA4238A0B923820DCC509A6F75849B', - // eslint-disable-next-line @typescript-eslint/naming-convention access_token: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', accessToken: '70BA33708CBFB103F1A8E34AFEF333BA7DC021022B2D9AAA583AABB8058D8D67', }; +/** + * Additional sensitive keys (newly added / previously uncovered). + * Keys intentionally use snake_case to match data-filter list. + */ +const additionalSensitiveDataMock = { + authorization: 'Bearer abc123', + token: 'token-value', + jwt: 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9', + session: 'sess_xyz', + session_id: 'sid_789', + api_key: 'sk_live_xxx', + bearer: 'Bearer token', + client_secret: 'client_secret_value', + passwd: 'passwd_value', + mysql_pwd: 'mysql_pwd_value', + private_key: '-----BEGIN PRIVATE KEY-----', + ssh_key: 'ssh-rsa AAAA...', + card: '4111111111111111', + cardnumber: '5500000000000004', + creditcard: '4111111111111111', + pin: '1234', + security_code: '999', + stripetoken: 'tok_xxx', + cloudpayments_public_id: 'pk_xxx', + cloudpayments_secret: 'secret_xxx', + dsn: 'postgres://user:pass@host/db', + ssn: '123-45-6789', +}; +/* eslint-enable @typescript-eslint/naming-convention */ + describe('GrouperWorker', () => { const dataFilter = new DataFilter(); @@ -123,6 +160,34 @@ describe('GrouperWorker', () => { }); }); + test('should filter additional sensitive keys (authorization, token, payment, dsn, ssn, etc.) in context', async () => { + const event = generateEvent({ + context: additionalSensitiveDataMock, + }); + + dataFilter.processEvent(event); + + Object.keys(additionalSensitiveDataMock).forEach((key) => { + expect(event.context[key]).toBe('[filtered]'); + }); + }); + + test('should filter additional sensitive keys in addons', async () => { + const event = generateEvent({ + addons: { + vue: { + props: additionalSensitiveDataMock, + }, + }, + }); + + dataFilter.processEvent(event); + + Object.keys(additionalSensitiveDataMock).forEach((key) => { + expect(event.addons['vue']['props'][key]).toBe('[filtered]'); + }); + }); + test('should not replace values with keynames not in a list', async () => { const normalValue = 'test123'; const event = generateEvent({ @@ -154,7 +219,7 @@ describe('GrouperWorker', () => { const event = generateEvent({ context: { userId: uuidWithManyDigits, - sessionId: uuidUpperCase, + requestId: uuidUpperCase, transactionId: uuidNoDashes, }, addons: { @@ -169,7 +234,7 @@ describe('GrouperWorker', () => { dataFilter.processEvent(event); expect(event.context['userId']).toBe(uuidWithManyDigits); - expect(event.context['sessionId']).toBe(uuidUpperCase); + expect(event.context['requestId']).toBe(uuidUpperCase); expect(event.context['transactionId']).toBe(uuidNoDashes); expect(event.addons['vue']['props']['componentId']).toBe(uuidWithManyDigits); }); From 81b0d272d6df2faf7c762b84aa5f917bf2a7e416 Mon Sep 17 00:00:00 2001 From: Taly Date: Tue, 10 Feb 2026 18:52:51 +0300 Subject: [PATCH 3/3] Limit path depth and add cache cleanup (#523) * Limit path depth and add cache cleanup Prevent excessive memory usage and reduce GC pressure by limiting traversal path depth in data-filter (cap depth logic at 20). In grouper worker, add a periodic cache cleanup interval (every 5 minutes) started on worker start and cleared on finish to avoid unbounded cache growth. Free large references after delta computation by nulling event payloads to allow garbage collection. Also tighten memoization for findSimilarEvent (max reduced from 200 to 50 and ttl set to 600s) to further limit memory retained by caches. * Prevent deep-path allocations; tune timeouts & tests Limit path growth in data filter to avoid creating new arrays past 20 levels (reduces excessive allocations for deeply nested objects). Import TimeMs and replace magic numbers: set MEMOIZATION_TTL to 600_000, use TimeMs.MINUTE for cache cleanup interval, and apply MEMOIZATION_TTL to the memoize decorator. Clear large delta references by setting them to undefined to aid GC. Add a test that verifies filtering works on objects nested >20 levels without causing excessive memory allocations. * Use named constants for traversal and cache interval Introduce MAX_TRAVERSAL_DEPTH in data-filter.ts and replace the hardcoded depth check (20) to prevent excessive memory allocations from deep object nesting. Add CACHE_CLEANUP_INTERVAL_MINUTES in index.ts and use it for the cache cleanup setInterval instead of the literal 5, improving readability and making these tuning values easier to adjust. * Move eslint ignore to MEMOIZATION_TTL Remove the unnecessary eslint-disable-next-line on the memoize import and apply the no-unused-vars ignore to the MEMOIZATION_TTL constant instead. This ensures the linter suppression targets the unused constant (decorators not counted) rather than the import, improving clarity. * Suppress no-unused-vars for memoize import Add an explanatory comment and an `/* eslint-disable-next-line no-unused-vars */` directive before the `memoize` import in workers/grouper/src/index.ts. This prevents ESLint from flagging the import as unused since decorators (which rely on the import) are not recognized as usages by the linter. --- workers/grouper/src/data-filter.ts | 12 ++++++- workers/grouper/src/index.ts | 42 +++++++++++++++++++++-- workers/grouper/tests/data-filter.test.ts | 34 ++++++++++++++++++ 3 files changed, 85 insertions(+), 3 deletions(-) diff --git a/workers/grouper/src/data-filter.ts b/workers/grouper/src/data-filter.ts index 7e00038c..2345b8e5 100644 --- a/workers/grouper/src/data-filter.ts +++ b/workers/grouper/src/data-filter.ts @@ -1,6 +1,11 @@ import type { EventAddons, EventData } from '@hawk.so/types'; import { unsafeFields } from '../../../lib/utils/unsafeFields'; +/** + * Maximum depth for object traversal to prevent excessive memory allocations + */ +const MAX_TRAVERSAL_DEPTH = 20; + /** * Recursively iterate through object and call function on each key * @@ -18,7 +23,12 @@ function forAll(obj: Record, callback: (path: string[], key: st if (!(typeof value === 'object' && !Array.isArray(value))) { callback(path, key, current); } else { - visit(value, [...path, key]); + /** + * Limit path depth to prevent excessive memory allocations from deep nesting + * This reduces GC pressure and memory usage for deeply nested objects + */ + const newPath = path.length < MAX_TRAVERSAL_DEPTH ? path.concat(key) : path; + visit(value, newPath); } } }; diff --git a/workers/grouper/src/index.ts b/workers/grouper/src/index.ts index 73f16fc7..542c701f 100644 --- a/workers/grouper/src/index.ts +++ b/workers/grouper/src/index.ts @@ -19,11 +19,16 @@ import type { RepetitionDBScheme } from '../types/repetition'; import { DatabaseReadWriteError, DiffCalculationError, ValidationError } from '../../../lib/workerErrors'; import { decodeUnsafeFields, encodeUnsafeFields } from '../../../lib/utils/unsafeFields'; import { MS_IN_SEC } from '../../../lib/utils/consts'; +import TimeMs from '../../../lib/utils/time'; import DataFilter from './data-filter'; import RedisHelper from './redisHelper'; import { computeDelta } from './utils/repetitionDiff'; import { rightTrim } from '../../../lib/utils/string'; import { hasValue } from '../../../lib/utils/hasValue'; + +/** + * eslint does not count decorators as a variable usage + */ /* eslint-disable-next-line no-unused-vars */ import { memoize } from '../../../lib/memoize'; @@ -31,7 +36,12 @@ import { memoize } from '../../../lib/memoize'; * eslint does not count decorators as a variable usage */ /* eslint-disable-next-line no-unused-vars */ -const MEMOIZATION_TTL = Number(process.env.MEMOIZATION_TTL ?? 0); +const MEMOIZATION_TTL = 600_000; + +/** + * Cache cleanup interval in minutes + */ +const CACHE_CLEANUP_INTERVAL_MINUTES = 5; /** * Error code of MongoDB key duplication error @@ -72,6 +82,11 @@ export default class GrouperWorker extends Worker { */ private redis = new RedisHelper(); + /** + * Interval for periodic cache cleanup to prevent memory leaks from unbounded cache growth + */ + private cacheCleanupInterval: NodeJS.Timeout | null = null; + /** * Start consuming messages */ @@ -85,6 +100,15 @@ export default class GrouperWorker extends Worker { await this.redis.initialize(); console.log('redis initialized'); + + /** + * Start periodic cache cleanup to prevent memory leaks from unbounded cache growth + * Runs every 5 minutes to clear old cache entries + */ + this.cacheCleanupInterval = setInterval(() => { + this.clearCache(); + }, CACHE_CLEANUP_INTERVAL_MINUTES * TimeMs.MINUTE); + await super.start(); } @@ -92,6 +116,14 @@ export default class GrouperWorker extends Worker { * Finish everything */ public async finish(): Promise { + /** + * Clear cache cleanup interval to prevent resource leaks + */ + if (this.cacheCleanupInterval) { + clearInterval(this.cacheCleanupInterval); + this.cacheCleanupInterval = null; + } + await super.finish(); this.prepareCache(); await this.eventsDb.close(); @@ -237,6 +269,12 @@ export default class GrouperWorker extends Worker { } as RepetitionDBScheme; repetitionId = await this.saveRepetition(task.projectId, newRepetition); + + /** + * Clear the large event payload references to allow garbage collection + * This prevents memory leaks from retaining full event objects after delta is computed + */ + delta = undefined; } /** @@ -334,7 +372,7 @@ export default class GrouperWorker extends Worker { * @param projectId - where to find * @param title - title of the event to find similar one */ - @memoize({ max: 200, ttl: MEMOIZATION_TTL, strategy: 'hash', skipCache: [undefined] }) + @memoize({ max: 50, ttl: MEMOIZATION_TTL, strategy: 'hash', skipCache: [undefined] }) private async findSimilarEvent(projectId: string, title: string): Promise { /** * If no match by Levenshtein, try matching by patterns diff --git a/workers/grouper/tests/data-filter.test.ts b/workers/grouper/tests/data-filter.test.ts index 4cb98807..2f00dd68 100644 --- a/workers/grouper/tests/data-filter.test.ts +++ b/workers/grouper/tests/data-filter.test.ts @@ -327,5 +327,39 @@ describe('GrouperWorker', () => { expect(event.context['secret']).toBe('[filtered]'); expect(event.context['auth']).toBe('[filtered]'); }); + + test('should handle deeply nested objects (>20 levels) without excessive memory allocations', () => { + // Create an object nested deeper than the cap (>20 levels) + let deeplyNested: any = { value: 'leaf', secret: 'should-be-filtered' }; + + for (let i = 0; i < 25; i++) { + deeplyNested = { [`level${i}`]: deeplyNested, password: `sensitive${i}` }; + } + + const event = generateEvent({ + context: deeplyNested, + }); + + // This should not throw or cause memory issues + dataFilter.processEvent(event); + + // Verify that filtering still works at various depths + expect(event.context['password']).toBe('[filtered]'); + + // Navigate to a mid-level and check filtering + let current = event.context['level24'] as any; + for (let i = 24; i > 15; i--) { + expect(current['password']).toBe('[filtered]'); + current = current[`level${i - 1}`]; + } + + // At the leaf level, the secret should still be filtered + // (though path tracking may be capped, filtering should still work) + let leaf = event.context; + for (let i = 24; i >= 0; i--) { + leaf = leaf[`level${i}`] as any; + } + expect(leaf['secret']).toBe('[filtered]'); + }); }); });