Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions src/main/java/com/mindee/input/LocalInputSource.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package com.mindee.input;

import com.mindee.image.ImageCompressor;
import com.mindee.pdf.PDFBoxApi;
import com.mindee.pdf.PDFCompressor;
import com.mindee.pdf.PDFOperation;
import com.mindee.pdf.PDFUtils;
import com.mindee.pdf.PdfBoxApi;
import com.mindee.pdf.PdfCompressor;
import com.mindee.pdf.PdfOperation;
import com.mindee.pdf.SplitQuery;
import java.io.File;
import java.io.IOException;
Expand All @@ -13,16 +13,20 @@
import java.nio.file.Path;
import java.util.Base64;
import lombok.Getter;
import lombok.Setter;
import org.apache.pdfbox.io.IOUtils;

/**
* A source document for Mindee API operations.
*/
@Getter
public final class LocalInputSource {

@Getter
private byte[] file;
@Getter
private final String filename;
@Setter
private PDFOperation pdfOperation;

public LocalInputSource(InputStream file, String filename) throws IOException {
this.file = IOUtils.toByteArray(file);
Expand Down Expand Up @@ -55,6 +59,13 @@ public LocalInputSource(String fileAsBase64, String filename) {
this.filename = filename;
}

/**
 * Returns the PDF operation implementation used by this input source.
 *
 * <p>If no implementation has been stored yet, a {@link PDFBoxApi} instance is created,
 * stored in the {@code pdfOperation} field, and returned; later calls return the stored
 * instance.
 *
 * @return the stored PDF operation, or a newly created {@link PDFBoxApi} when none was set.
 */
public PDFOperation getPdfOperation() {
  PDFOperation current = this.pdfOperation;
  if (current == null) {
    // Nothing stored yet: fall back to the PDFBox-backed implementation and remember it.
    current = new PDFBoxApi();
    this.pdfOperation = current;
  }
  return current;
}

/**
* Get the number of pages in the document.
*
Expand All @@ -76,8 +87,7 @@ public int getPageCount() throws IOException {
*/
public void applyPageOptions(PageOptions pageOptions) throws IOException {
if (pageOptions != null && this.isPdf()) {
PdfOperation pdfOperation = new PdfBoxApi();
this.file = pdfOperation.split(new SplitQuery(this.file, pageOptions)).getFile();
this.file = getPdfOperation().split(new SplitQuery(this.file, pageOptions)).getFile();
}
}

Expand All @@ -97,7 +107,7 @@ public void compress(
Boolean disableSourceText
) throws IOException {
if (isPdf()) {
this.file = PdfCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
} else {
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
import com.mindee.MindeeException;
import com.mindee.input.InputSourceUtils;
import com.mindee.input.LocalInputSource;
import com.mindee.v1.product.invoicesplitter.InvoiceSplitterV1InvoicePageGroup;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
Expand All @@ -24,33 +21,23 @@
/**
* PDF extraction class.
*/
public class PDFExtractor {
private final PDDocument sourcePdf;
private final String filename;

/**
* Init from a path.
*
* @param filePath Path to the file.
* @throws IOException Throws if the file can't be accessed.
*/
public PDFExtractor(String filePath) throws IOException {
this(new LocalInputSource(filePath));
}
public class BasePDFExtractor {
protected final PDDocument sourcePdf;
protected final String filename;

/**
* Init from a {@link LocalInputSource}.
*
* @param source The local source.
* @throws IOException Throws if the file can't be accessed.
*/
public PDFExtractor(LocalInputSource source) throws IOException {
protected BasePDFExtractor(LocalInputSource source) throws IOException {
this.filename = source.getFilename();
if (source.isPdf()) {
this.sourcePdf = Loader.loadPDF(source.getFile());
} else {
PDDocument document = new PDDocument();
PDPage page = new PDPage();
var document = new PDDocument();
var page = new PDPage();
document.addPage(page);
BufferedImage bufferedImage = byteArrayToBufferedImage(source.getFile());
PDImageXObject pdImage = LosslessFactory.createFromImage(document, bufferedImage);
Expand All @@ -65,7 +52,6 @@ public PDFExtractor(LocalInputSource source) throws IOException {
);
}
this.sourcePdf = document;

}
}

Expand Down Expand Up @@ -101,7 +87,7 @@ public static BufferedImage byteArrayToBufferedImage(byte[] byteArray) throws IO
public List<ExtractedPDF> extractSubDocuments(
List<List<Integer>> pageIndexes
) throws IOException {
List<ExtractedPDF> extractedPDFs = new ArrayList<>();
var extractedPDFs = new ArrayList<ExtractedPDF>();

for (List<Integer> pageIndexElement : pageIndexes) {
if (pageIndexElement.isEmpty()) {
Expand All @@ -126,65 +112,4 @@ public List<ExtractedPDF> extractSubDocuments(
}
return extractedPDFs;
}

/**
 * Extracts one sub-document per invoice page group.
 *
 * <p>The page indexes of every group are collected, in order, and handed to
 * {@link #extractSubDocuments(List)}.
 *
 * @param pageIndexes Page groups (from an invoice-splitter prediction).
 * @return a list of extracted files.
 * @throws IOException Propagated from {@link #extractSubDocuments(List)}.
 */
public List<ExtractedPDF> extractInvoices(
    List<InvoiceSplitterV1InvoicePageGroup> pageIndexes
) throws IOException {
  List<List<Integer>> groupedPages = new ArrayList<>(pageIndexes.size());
  for (InvoiceSplitterV1InvoicePageGroup group : pageIndexes) {
    groupedPages.add(group.getPageIndexes());
  }
  return extractSubDocuments(groupedPages);
}

/**
 * Extract invoices from the given page groups (from an invoice-splitter prediction),
 * optionally regrouping pages according to each group's confidence score.
 *
 * <p>When {@code strict} is {@code false} this simply delegates to
 * {@link #extractInvoices(List)}. When {@code strict} is {@code true} the groups are walked
 * in order and a "current" page list is maintained:
 * <ul>
 * <li>a group with confidence {@code 1.0} starts a new current list (the previous current
 * list is appended to the result first, unless this is the first group);</li>
 * <li>a final group with confidence {@code 0.0} is merged into the current list, which is
 * then appended to the result;</li>
 * <li>any other group causes the current list and then the group's own page list to be
 * appended to the result.</li>
 * </ul>
 *
 * @param pageIndexes List of page groups.
 * @param strict Whether the extraction should strictly follow the confidence scores or not.
 * @return a list of extracted files.
 * @throws IOException Propagated from {@link #extractSubDocuments(List)}.
 */
public List<ExtractedPDF> extractInvoices(
List<InvoiceSplitterV1InvoicePageGroup> pageIndexes,
boolean strict
) throws IOException {
List<List<Integer>> correctPageIndexes = new ArrayList<>();
if (!strict) {
// Non-strict mode ignores confidence scores entirely.
return extractInvoices(pageIndexes);
}
Iterator<InvoiceSplitterV1InvoicePageGroup> iterator = pageIndexes.iterator();
// Pages accumulated for the invoice currently being assembled.
List<Integer> currentList = new ArrayList<>();
// Stays null until the first group has been processed; used to detect the first iteration.
Double previousConfidence = null;
while (iterator.hasNext()) {
InvoiceSplitterV1InvoicePageGroup pageIndex = iterator.next();
// NOTE(review): the comparisons below unbox this Double; a null confidence would throw
// a NullPointerException — confirm getConfidence() never returns null.
Double confidence = pageIndex.getConfidence();
List<Integer> pageList = pageIndex.getPageIndexes();

if (confidence == 1.0 && previousConfidence == null) {
// First group, fully confident: it seeds the current list (copied, not aliased).
currentList = new ArrayList<>(pageList);
} else if (confidence == 1.0) {
// Later fully-confident group: flush the previous list and start a new one.
correctPageIndexes.add(currentList);
currentList = new ArrayList<>(pageList);
} else if (confidence == 0.0 && !iterator.hasNext()) {
// Last group with zero confidence: fold its pages into the current list and flush it.
currentList.addAll(pageList);
correctPageIndexes.add(currentList);
} else {
// Any other case: flush the current list, then add this group's pages as their own entry.
// NOTE(review): currentList is not reset here, so the same list instance can be appended
// again on a later iteration (and may be empty if this is the first group) — confirm
// this is intended.
correctPageIndexes.add(currentList);
correctPageIndexes.add(pageList);
}
previousConfidence = confidence;
}
// NOTE(review): currentList is not appended after the loop, so when the last group has
// confidence 1.0 its pages never reach correctPageIndexes — confirm this is intended.
return extractSubDocuments(correctPageIndexes);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,30 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;

/**
* Allows performing various operations on PDFs.
*/
public final class PdfBoxApi implements PdfOperation {
public final class PDFBoxApi implements PDFOperation {

@Override
public SplitPdf split(SplitQuery splitQuery) throws IOException {
public SplitPDF split(SplitQuery splitQuery) throws IOException {

if (!checkPdfOpen(splitQuery.getFile())) {
throw new MindeeException("This document cannot be open and cannot be split.");
}

try (PDDocument originalDocument = Loader.loadPDF(splitQuery.getFile())) {
try (PDDocument splitDocument = new PDDocument()) {
try (var originalDocument = Loader.loadPDF(splitQuery.getFile())) {
try (var splitDocument = new PDDocument()) {
int totalOriginalPages = countPages(splitQuery.getFile());

if (totalOriginalPages < splitQuery.getPageOptions().getOnMinPages()) {
return new SplitPdf(splitQuery.getFile(), totalOriginalPages);
return new SplitPDF(splitQuery.getFile(), totalOriginalPages);
}

List<Integer> pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);

var pageRange = getPageRanges(splitQuery.getPageOptions(), totalOriginalPages);
pageRange
.stream()
.filter(i -> i < totalOriginalPages)
Expand All @@ -45,7 +43,7 @@ public SplitPdf split(SplitQuery splitQuery) throws IOException {
try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
splitDocument.save(outputStream);
byte[] splitPdf = outputStream.toByteArray();
return new SplitPdf(splitPdf, countPages(splitPdf));
return new SplitPDF(splitPdf, countPages(splitPdf));
}
}
}
Expand All @@ -55,12 +53,12 @@ private List<Integer> getPageRanges(PageOptions pageOptions, Integer numberOfPag

Set<Integer> pages = Optional
.ofNullable(pageOptions.getPageIndexes())
.map(Collection::stream)
.orElseGet(Stream::empty)
.stream()
.flatMap(Collection::stream)
.filter(x -> x > (numberOfPages) * (-1) && x <= (numberOfPages - 1))
.map(x -> (numberOfPages + x) % numberOfPages)
.collect(Collectors.toSet());
List<Integer> allPages = IntStream.range(0, numberOfPages).boxed().collect(Collectors.toList());
var allPages = IntStream.range(0, numberOfPages).boxed().collect(Collectors.toList());

switch (pageOptions.getOperation()) {
case KEEP_ONLY:
Expand All @@ -85,9 +83,6 @@ private boolean checkPdfOpen(byte[] documentFile) {
}

private int countPages(byte[] documentFile) throws IOException {
PDDocument document = Loader.loadPDF(documentFile);
int pageCount = document.getNumberOfPages();
document.close();
return pageCount;
return PDFUtils.getNumberOfPages(documentFile);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,13 @@
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
* PDF compression class.
*/
public class PdfCompressor {
public class PDFCompressor {
public static byte[] compressPdf(
byte[] pdfData,
Integer imageQuality,
Expand All @@ -44,10 +43,10 @@ public static byte[] compressPdf(
}
try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) {

PDFRenderer pdfRenderer = new PDFRenderer(inputDoc);
var pdfRenderer = new PDFRenderer(inputDoc);

for (int pageIndex = 0; pageIndex < inputDoc.getNumberOfPages(); pageIndex++) {
PDPage originalPage = inputDoc.getPage(pageIndex);
var originalPage = inputDoc.getPage(pageIndex);
PDRectangle originalPageSize = originalPage.getMediaBox();

processPage(
Expand Down Expand Up @@ -92,12 +91,12 @@ private static void processPage(
PDRectangle originalPageSize,
Boolean disableSourceText
) throws IOException {
PDPage newPage = new PDPage(originalPageSize);
var newPage = new PDPage(originalPageSize);
outputDoc.addPage(newPage);

PDImageXObject pdImage = JPEGFactory.createFromImage(outputDoc, image, imageQuality);
var pdImage = JPEGFactory.createFromImage(outputDoc, image, imageQuality);

try (PDPageContentStream contentStream = new PDPageContentStream(outputDoc, newPage)) {
try (var contentStream = new PDPageContentStream(outputDoc, newPage)) {
PDFUtils.addImageToPage(contentStream, pdImage, originalPageSize);
PDFUtils.extractAndAddText(originalDocument, contentStream, pageIndex, disableSourceText);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
/**
* Minimum PDF operations.
*/
public interface PdfOperation {
public interface PDFOperation {

/**
* Split a PDF file.
*
* @param splitQuery Options to perform the query.
* @return The split PDF.
*/
SplitPdf split(SplitQuery splitQuery) throws IOException;
SplitPDF split(SplitQuery splitQuery) throws IOException;
}
10 changes: 5 additions & 5 deletions src/main/java/com/mindee/pdf/PDFUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ private static byte[] createPdfFromExistingPdf(
List<Integer> pageNumbers,
boolean closeOriginal
) throws IOException {
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
PDDocument newDocument = new PDDocument();
var outputStream = new ByteArrayOutputStream();
var newDocument = new PDDocument();
int pageCount = document.getNumberOfPages();
pageNumbers
.stream()
Expand Down Expand Up @@ -161,7 +161,7 @@ public static List<PdfPageImage> pdfToImages(String filePath) throws IOException
*/
public static List<PdfPageImage> pdfToImages(LocalInputSource source) throws IOException {
PDDocument document = Loader.loadPDF(source.getFile());
PDFRenderer pdfRenderer = new PDFRenderer(document);
var pdfRenderer = new PDFRenderer(document);
List<PdfPageImage> pdfPageImages = new ArrayList<>();
for (int i = 0; i < document.getNumberOfPages(); i++) {
BufferedImage imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
Expand Down Expand Up @@ -201,7 +201,7 @@ public static PdfPageImage pdfPageToImage(
) throws IOException {
int index = pageNumber - 1;
PDDocument document = Loader.loadPDF(source.getFile());
PDFRenderer pdfRenderer = new PDFRenderer(document);
var pdfRenderer = new PDFRenderer(document);
BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer);
document.close();
return new PdfPageImage(imageBuffer, index, source.getFilename(), "jpg");
Expand All @@ -226,7 +226,7 @@ private static BufferedImage pdfPageToImageBuffer(
}

public static byte[] documentToBytes(PDDocument document) throws IOException {
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
var outputStream = new ByteArrayOutputStream();
document.save(outputStream);
return outputStream.toByteArray();
}
Expand Down
Loading
Loading