diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000000..acd7e480fc916 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,204 @@ +## Overview + +PDF.js is a Portable Document Format (PDF) viewer built with JavaScript, HTML5 Canvas, and CSS. It's a Mozilla project that provides a general-purpose, web standards-based platform for parsing and rendering PDFs without requiring native code or plugins. + +## Common Commands + +### Development Server +```bash +npx gulp server +``` +Then open http://localhost:8888/web/viewer.html to view the PDF viewer. Test PDFs are available at http://localhost:8888/test/pdfs/?frame + +### Building + +Build for modern browsers: +```bash +npx gulp generic +``` + +This generates `pdf.js` and `pdf.worker.js` in `build/generic/build/`. + +Build for distribution (creates pdfjs-dist package): +```bash +npx gulp dist +npx gulp dist-install # Build and install locally +``` + +### Testing + +Run all tests: +```bash +npx gulp test +``` + +Run unit tests only: +```bash +npx gulp unittest +``` + +Run integration tests (browser-based tests using Puppeteer): +```bash +npx gulp integrationtest +``` + +Run font tests: +```bash +npx gulp fonttest +``` + +Run a single test file by modifying test/test_manifest.json or using test runner options. + +### Linting and Formatting + +Lint JavaScript: +```bash +npx gulp lint +``` + +Format code (uses Prettier and ESLint): +```bash +npx eslint --fix +``` + +### Type Checking + +Run TypeScript type checking: +```bash +npx gulp typestest +``` + +## Architecture + +### High-Level Structure + +PDF.js has a multi-layer architecture that separates concerns between PDF parsing, rendering, and UI: + +#### 1. Core Layer (`src/core/`) +The core layer handles PDF parsing and interpretation. 
Key responsibilities: +- **PDF parsing**: Parsing PDF structure, cross-reference tables, streams +- **Font handling**: CFF, TrueType, Type1 font parsing and conversion (`font.js`, `fonts.js`, `cff_*.js`, `type1_*.js`) +- **Image decoding**: JPEG, JBIG2, JPX/JPEG2000 decoders +- **Operators**: Processing PDF drawing operators (`operator_list.js`, `evaluator.js`) +- **XFA Forms**: XML Forms Architecture support (`src/core/xfa/`) +- **Color spaces**: ICC profiles, device color spaces (`colorspace.js`, `icc_colorspace.js`) +- Runs in a Web Worker for performance isolation + +Entry point: `src/pdf.worker.js` + +#### 2. Display Layer (`src/display/`) +The display layer provides the API for rendering PDFs to canvas and managing documents. Key components: +- **API**: Main public API (`api.js`) - `PDFDocumentProxy`, `PDFPageProxy`, `getDocument()` +- **Canvas rendering**: Renders PDF operations to HTML5 canvas (`canvas.js`) +- **Text layer**: Extracts and positions text for selection/search (`text_layer.js`) +- **Annotation layer**: Renders and handles PDF annotations (`annotation_layer.js`) +- **Editor layer**: Supports PDF editing (annotations, highlights, stamps) (`editor/`) +- **Metadata**: Parses XMP metadata (`metadata.js`) +- **Streams**: Handles PDF data fetching (fetch, network, node) (`fetch_stream.js`, `network.js`, `node_stream.js`) + +Entry point: `src/pdf.js` + +#### 3. Scripting Layer (`src/scripting_api/`) +Implements JavaScript execution for interactive PDFs (form calculations, validations, button actions). +- Sandboxed execution environment +- Implements Acrobat JavaScript API objects (App, Doc, Field, etc.) + +Entry points: `src/pdf.scripting.js`, `src/pdf.sandbox.js` + +#### 4. Web Viewer (`web/`) +The complete PDF viewer application with UI. 
Key components: +- **Main app**: Application orchestration (`app.js`) +- **Viewer**: Page rendering and layout (`pdf_viewer.js`, `pdf_page_view.js`) +- **Toolbar**: Zoom, page navigation, print, download controls +- **Sidebar**: Thumbnails, outlines, attachments (`pdf_sidebar.js`, `pdf_thumbnail_view.js`, `pdf_outline_viewer.js`) +- **Find controller**: Text search functionality (`pdf_find_controller.js`) +- **Annotation editors**: UI for creating/editing annotations (`annotation_editor_layer_builder.js`) +- **Presentation mode**: Full-screen presentation (`pdf_presentation_mode.js`) + +Entry point: `web/viewer.html` + `web/viewer.mjs` + +#### 5. Shared Utilities (`src/shared/`) +Common utilities used across layers: +- **Message handling**: Worker communication (`message_handler.js`) +- **Utilities**: Common functions and constants (`util.js`) +- **Image utilities**: Image processing helpers (`image_utils.js`) + +### Worker Communication + +PDF.js uses a Web Worker architecture: +- Main thread (`display` layer) communicates with worker thread (`core` layer) via `MessageHandler` +- Keeps PDF parsing off the main thread for better performance +- Messages include: page rendering requests, text content extraction, metadata queries + +### Build System + +- Uses **Gulp** for build orchestration (`gulpfile.mjs`) +- **Webpack** bundles modules into browser-compatible formats +- **Babel** transpiles for browser compatibility (configurable targets in gulpfile) +- Preprocessor replaces build-time constants (e.g., `typeof PDFJSDev !== "undefined"` checks) +- Multiple build targets: generic, components, minified, legacy (older browser support) + +### External Dependencies + +Located in `external/`: +- **bcmaps**: Binary CMaps for CJK fonts +- **standard_fonts**: Core 14 PDF fonts metrics +- **cmapscompress**: Tools for compressing CMaps +- **openjpeg**: JPEG2000 decoder (WASM) +- **quickjs**: JavaScript engine for sandboxed execution + +### Translations + +Translations in 
`l10n/` are imported from Mozilla Firefox Nightly. Only the file l10n/en-US/viewer.ftl can be updated. + +## Development Notes + +### Adding New Features + +When adding features that span multiple layers: +1. Start with the `core` layer if parsing/interpretation changes are needed +2. Update the `display` layer API if new capabilities need exposure +3. Modify the `web` viewer if UI changes are required +4. Ensure worker communication handles new message types + +### Preprocessor Directives + +Code uses preprocessor checks for build-time conditionals: +```javascript +if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("GENERIC")) { + // Generic build-specific code +} +``` + +Common flags: `GENERIC`, `MOZCENTRAL`, `CHROME`, `MINIFIED`, `TESTING`, `LIB`, `SKIP_BABEL`, `IMAGE_DECODERS` + +### Testing + +- Unit tests use Jasmine framework (`test/unit/`) +- Integration tests use Puppeteer for browser automation (`test/integration/`) +- Test PDFs downloaded from manifest (`test/test_manifest.json`) +- Reference images for visual regression testing (`test/ref/`) + +### Code Style + +- Uses ESLint with custom configuration (`eslint.config.mjs`) +- Prettier for formatting +- Stylelint for CSS +- Semicolons are required (do not rely on ASI) +- Double quotes for strings + +### Pull Request Process + +- Keep PRs focused on a single issue +- Provide a test PDF if the issue is PDF-specific +- Ensure tests pass (`npx gulp test`) +- Run linting (`npx gulp lint`) +- Follow existing code patterns +- Don't modify translations directly (they come from Firefox) + +### Performance Considerations + +- Core parsing runs in a Web Worker - keep main thread work minimal +- Canvas rendering can be expensive - use appropriate scale factors +- Text layer generation is separate from rendering - can be deferred +- Annotation layer is optional - only enable when needed diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000..43c994c2d3617 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ 
+@AGENTS.md diff --git a/src/core/cff_parser.js b/src/core/cff_parser.js index 61dcfa2a5feb8..0e9a91be369de 100644 --- a/src/core/cff_parser.js +++ b/src/core/cff_parser.js @@ -119,8 +119,8 @@ const CharstringValidationData = [ /* 7 */ { id: "vlineto", min: 1, resetStack: true }, /* 8 */ { id: "rrcurveto", min: 6, resetStack: true }, /* 9 */ null, - /* 10 */ { id: "callsubr", min: 1, undefStack: true }, - /* 11 */ { id: "return", min: 0, undefStack: true }, + /* 10 */ { id: "callsubr", min: 1 }, + /* 11 */ { id: "return", min: 0 }, /* 12 */ null, /* 13 */ null, /* 14 */ { id: "endchar", min: 0, stackClearing: true }, @@ -138,7 +138,7 @@ const CharstringValidationData = [ /* 26 */ { id: "vvcurveto", min: 4, resetStack: true }, /* 27 */ { id: "hhcurveto", min: 4, resetStack: true }, /* 28 */ null, // shortint - /* 29 */ { id: "callgsubr", min: 1, undefStack: true }, + /* 29 */ { id: "callgsubr", min: 1 }, /* 30 */ { id: "vhcurveto", min: 4, resetStack: true }, /* 31 */ { id: "hvcurveto", min: 4, resetStack: true }, ]; @@ -627,26 +627,24 @@ class CFFParser { data[j - 1] = value === 1 ? 3 : 23; } } - if ("min" in validationCommand) { - if (!state.undefStack && stackSize < validationCommand.min) { - warn( - "Not enough parameters for " + - validationCommand.id + - "; actual: " + - stackSize + - ", expected: " + - validationCommand.min - ); - - if (stackSize === 0) { - // Just "fix" the outline in replacing command by a endchar: - // it could lead to wrong rendering of some glyphs or not. - // For example, the pdf in #6132 is well-rendered. - data[j - 1] = 14; - return true; - } - return false; + if (stackSize < validationCommand.min) { + warn( + "Not enough parameters for " + + validationCommand.id + + "; actual: " + + stackSize + + ", expected: " + + validationCommand.min + ); + + if (stackSize === 0) { + // Just "fix" the outline in replacing command by a endchar: + // it could lead to wrong rendering of some glyphs or not. 
+ // For example, the pdf in #6132 is well-rendered. + data[j - 1] = 14; + return true; } + return false; } if (state.firstStackClearing && validationCommand.stackClearing) { state.firstStackClearing = false; @@ -670,15 +668,11 @@ class CFFParser { validationCommand.stackFn(stack, stackSize); } stackSize += validationCommand.stackDelta; - } else if (validationCommand.stackClearing) { - stackSize = 0; - } else if (validationCommand.resetStack) { - stackSize = 0; - state.undefStack = false; - } else if (validationCommand.undefStack) { + } else if ( + validationCommand.stackClearing || + validationCommand.resetStack + ) { stackSize = 0; - state.undefStack = true; - state.firstStackClearing = false; } } } @@ -706,7 +700,6 @@ class CFFParser { callDepth: 0, stackSize: 0, stack: [], - undefStack: true, hints: 0, firstStackClearing: true, seac: null, diff --git a/src/core/fonts.js b/src/core/fonts.js index 97ea14c70fd62..532896e0acbaa 100644 --- a/src/core/fonts.js +++ b/src/core/fonts.js @@ -2702,13 +2702,36 @@ class Font { writeUint32(tables.maxp.data, 0, version); } + let isGlyphLocationsLong = int16( + tables.head.data[50], + tables.head.data[51] + ); + if (tables.loca) { + const locaLength = isGlyphLocationsLong + ? (numGlyphs + 1) * 4 + : (numGlyphs + 1) * 2; + if (tables.loca.length !== locaLength) { + warn("Incorrect 'loca' table length -- attempting to fix it."); + // The length of the loca table is wrong (see #13425), so we check if we + // have enough space to fix it. 
+ const sortedTables = Object.values(tables) + .filter(Boolean) + .sort((a, b) => a.offset - b.offset); + const locaIndex = sortedTables.indexOf(tables.loca); + const nextTable = sortedTables[locaIndex + 1] || null; + if (nextTable && tables.loca.offset + locaLength < nextTable.offset) { + const previousPos = font.pos; + font.pos = font.start || 0; + font.skip(tables.loca.offset); + tables.loca.data = font.getBytes(locaLength); + tables.loca.length = locaLength; + font.pos = previousPos; + } + } + } + if (properties.scaleFactors?.length === numGlyphs && isTrueType) { const { scaleFactors } = properties; - const isGlyphLocationsLong = int16( - tables.head.data[50], - tables.head.data[51] - ); - const glyphs = new GlyfTable({ glyfTable: tables.glyf.data, isGlyphLocationsLong, @@ -2723,7 +2746,7 @@ class Font { if (isLocationLong !== !!isGlyphLocationsLong) { tables.head.data[50] = 0; - tables.head.data[51] = isLocationLong ? 1 : 0; + isGlyphLocationsLong = tables.head.data[51] = isLocationLong ? 
1 : 0; } const metrics = tables.hmtx.data; @@ -2801,10 +2824,6 @@ class Font { let missingGlyphs = Object.create(null); if (isTrueType) { - const isGlyphLocationsLong = int16( - tables.head.data[50], - tables.head.data[51] - ); const glyphsInfo = sanitizeGlyphLocations( tables.loca, tables.glyf, diff --git a/test/pdfs/issue13425.pdf.link b/test/pdfs/issue13425.pdf.link new file mode 100644 index 0000000000000..96cd322c69574 --- /dev/null +++ b/test/pdfs/issue13425.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/6529459/20200927_204903_509.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index 4ea18bc5421fc..03f53f1394729 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -13115,5 +13115,13 @@ "md5": "b85c798b9a4cc2cd4337d335321cc612", "rounds": 1, "type": "eq" + }, + { + "id": "issue13425", + "file": "pdfs/issue13425.pdf", + "md5": "36854e6ad43b8e0446d3d64e8f2950bf", + "rounds": 1, + "link": true, + "type": "eq" } ] diff --git a/test/unit/autolinker_spec.js b/test/unit/autolinker_spec.js index a7755af671431..2063e07f8ef37 100644 --- a/test/unit/autolinker_spec.js +++ b/test/unit/autolinker_spec.js @@ -90,6 +90,10 @@ describe("autolinker", function () { ["partl@mail.boku.ac.at", "mailto:partl@mail.boku.ac.at"], ["Irene.Hyna@bmwf.ac.at", "mailto:Irene.Hyna@bmwf.ac.at"], ["", "mailto:hi@foo.bar.baz"], + [ + "foo@用户@例子.广告", + "mailto:%E7%94%A8%E6%88%B7@%E4%BE%8B%E5%AD%90.%E5%B9%BF%E5%91%8A", + ], ]); }); @@ -144,6 +148,7 @@ describe("autolinker", function () { "http//[]", // Empty IPv6 address. "abc.example.com", // URL without scheme. "JD?M$0QP)lKn06l1apKDC@\\qJ4B!!(5m+j.7F790m", // Not a valid email. + "262@0.302304", // Invalid domain. 
].join("\n") ); expect(matches.length).toEqual(0); diff --git a/web/autolinker.js b/web/autolinker.js index 85b7e8e28b14e..a3045235eccc0 100644 --- a/web/autolinker.js +++ b/web/autolinker.js @@ -133,10 +133,12 @@ class Autolinker { static #regex; + static #numericTLDRegex; + static findLinks(text) { // Regex can be tested and verified at https://regex101.com/r/rXoLiT/2. this.#regex ??= - /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|\b[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv; + /\b(?:https?:\/\/|mailto:|www\.)(?:[\S--[\p{P}<>]]|\/|[\S--[\[\]]]+[\S--[\p{P}<>]])+|(?=\p{L})[\S--[@\p{Ps}\p{Pe}<>]]+@([\S--[\p{P}<>]]+(?:\.[\S--[\p{P}<>]]+)+)/gmv; const [normalizedText, diffs] = normalize(text, { ignoreDashEOL: true }); const matches = normalizedText.matchAll(this.#regex); @@ -150,11 +152,19 @@ class Autolinker { url.startsWith("https://") ) { raw = url; - } else if (URL.canParse(`http://${emailDomain}`)) { - raw = url.startsWith("mailto:") ? url : `mailto:${url}`; - } else { - continue; + } else if (emailDomain) { + const hostname = URL.parse(`http://${emailDomain}`)?.hostname; + if (!hostname) { + continue; + } + this.#numericTLDRegex ??= /\.\d+$/; + if (this.#numericTLDRegex.test(hostname)) { + // Skip emails with a numeric TLD as domain. + continue; + } } + raw ??= url.startsWith("mailto:") ? url : `mailto:${url}`; + const absoluteURL = createValidAbsoluteUrl(raw, null, { addDefaultProtocol: true, });