Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ gem "rqrcode"
gem "thruster"
gem "useragent", github: "basecamp/useragent"
gem "front_matter_parser"
gem "pdf-reader", "~> 2.12"

group :development, :test do
gem "debug"
Expand Down
12 changes: 12 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,12 @@ GIT
GEM
remote: https://rubygems.org/
specs:
Ascii85 (2.0.1)
action_text-trix (2.1.17)
railties
addressable (2.8.6)
public_suffix (>= 2.0.2, < 6.0)
afm (1.0.0)
ast (2.4.2)
base64 (0.3.0)
bcrypt (3.1.22)
Expand Down Expand Up @@ -146,6 +148,7 @@ GEM
front_matter_parser (1.0.1)
globalid (1.3.0)
activesupport (>= 6.1)
hashery (2.1.2)
i18n (1.14.8)
concurrent-ruby (~> 1.0)
image_processing (1.13.0)
Expand Down Expand Up @@ -213,6 +216,12 @@ GEM
parser (3.3.3.0)
ast (~> 2.4.1)
racc
pdf-reader (2.15.1)
Ascii85 (>= 1.0, < 3.0, != 2.0.0)
afm (>= 0.2.1, < 2)
hashery (~> 2.0)
ruby-rc4
ttfunk
pp (0.6.3)
prettyprint
prettyprint (0.2.0)
Expand Down Expand Up @@ -308,6 +317,7 @@ GEM
rubocop-performance
rubocop-rails
ruby-progressbar (1.13.0)
ruby-rc4 (0.1.5)
ruby-vips (2.2.2)
ffi (~> 1.12)
logger
Expand Down Expand Up @@ -345,6 +355,7 @@ GEM
tilt (2.4.0)
timeout (0.6.0)
tsort (0.2.0)
ttfunk (1.7.0)
turbo-rails (2.0.11)
actionpack (>= 6.0.0)
railties (>= 6.0.0)
Expand Down Expand Up @@ -384,6 +395,7 @@ DEPENDENCIES
image_processing (~> 1.13)
importmap-rails
jbuilder
pdf-reader (~> 2.12)
propshaft
puma (>= 5.0)
rails!
Expand Down
20 changes: 20 additions & 0 deletions app/controllers/books/imports_controller.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Accepts a PDF upload for a book and turns its pages into leaves.
#
# `@book` and `ensure_editable` are not defined in this class; presumably they
# come from BookScoped / ApplicationController — verify there.
class Books::ImportsController < ApplicationController
  include BookScoped

  before_action :ensure_editable

  # Imports the uploaded PDF and redirects back to the book. When nothing was
  # imported (no file, unreadable file, or a PDF without content) the redirect
  # carries an alert instead.
  def create
    if import_leaves.any?
      redirect_to book_slug_url(@book)
    else
      redirect_to book_slug_url(@book), alert: "Could not import PDF."
    end
  end

  private
    # Runs the importer and returns the imported leaves. Import failures are
    # logged and reported as an empty list so #create can show the alert.
    def import_leaves
      PdfImporter.new(@book, uploaded_pdf).import
    rescue PdfImporter::InvalidPdfError, ArgumentError => e
      Rails.logger.error("PdfImporter failed: #{e.class}: #{e.message}")
      []
    end

    # Returns the uploaded file's tempfile, or nil when `pdf` is missing or was
    # not sent as a file upload (e.g. a plain string field). Without this guard
    # a non-file `pdf` param would raise NoMethodError on `tempfile`.
    def uploaded_pdf
      upload = params[:pdf]
      upload.tempfile if upload.respond_to?(:tempfile)
    end
end
9 changes: 9 additions & 0 deletions app/javascript/controllers/loading_controller.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import { Controller } from "@hotwired/stimulus"

// Stimulus controller that disables a set of elements once an action starts.
// Elements opt in with data-<identifier>-target="disable".
export default class extends Controller {
  static targets = [ "disable" ]

  // Sets `disabled = true` on every "disable" target of this controller.
  start() {
    for (const element of this.disableTargets) {
      element.disabled = true
    }
  }
}
36 changes: 36 additions & 0 deletions app/models/pdf_importer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Imports a PDF into a book: every non-blank page contributes one text leaf
# (when it has body text) plus one picture leaf per extracted image.
class PdfImporter
  # Raised when no file is given or pdf-reader cannot read the document.
  class InvalidPdfError < StandardError; end

  # book   - the object leaves are pressed into (must respond to #press).
  # pdf_io - the PDF source handed to PDF::Reader.new.
  #
  # Raises InvalidPdfError when pdf_io is blank or pdf-reader rejects it.
  def initialize(book, pdf_io)
    raise InvalidPdfError, "No PDF file provided" if pdf_io.blank?

    @book = book
    @reader = translating_reader_errors { PDF::Reader.new(pdf_io) }
  end

  # Parses all pages, then presses the resulting leaves in one transaction.
  #
  # Returns the flat list of values returned by @book.press.
  # Raises InvalidPdfError when pdf-reader fails while reading the pages.
  def import
    # pdf-reader parses lazily, so malformed or unsupported content can still
    # raise here, after the constructor succeeded. Parsing also happens before
    # the transaction is opened so the transaction only spans leaf creation.
    pages = translating_reader_errors { pdf_pages }

    ActiveRecord::Base.transaction do
      pages.flat_map { |page| leaves_for(page) }
    end
  end

  private
    # Runs the block, re-raising pdf-reader's parse errors as InvalidPdfError
    # so callers only need to rescue a single error class.
    def translating_reader_errors
      yield
    rescue PDF::Reader::MalformedPDFError, PDF::Reader::UnsupportedFeatureError => e
      raise InvalidPdfError, e.message
    end

    # Wraps each reader page in a PdfPage (numbered from 1), dropping pages
    # that have neither text nor pictures.
    def pdf_pages
      @reader.pages.each_with_index.filter_map do |page, index|
        parsed = PdfPage.new(page, index + 1)
        parsed unless parsed.blank?
      end
    end

    # Presses the page's text (if any) followed by each of its pictures; all
    # leaves of a page share the page's title.
    def leaves_for(pdf_page)
      text_leaves =
        if pdf_page.body.present?
          [ @book.press(Page.new(body: pdf_page.body), title: pdf_page.title) ]
        else
          []
        end

      picture_leaves = pdf_page.pictures.map do |attachment|
        picture = Picture.new
        picture.image.attach(attachment)
        @book.press(picture, title: pdf_page.title)
      end

      text_leaves + picture_leaves
    end
end
30 changes: 30 additions & 0 deletions app/models/pdf_page.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# One page of a parsed PDF: its stripped text, a derived title and the
# pictures extracted from it.
class PdfPage
  MIN_TITLE_LENGTH = 3
  MAX_TITLE_LENGTH = 100

  attr_reader :pictures

  # page        - an object responding to #text; it is also handed to
  #               PdfPicture.extract_from.
  # page_number - number used for the fallback title.
  def initialize(page, page_number)
    @page = page
    @page_number = page_number
    @text = page.text.strip
    @pictures = PdfPicture.extract_from(page)
  end

  # True when the page has neither text nor pictures.
  def blank?
    @text.blank? && @pictures.empty?
  end

  # The first line of text when its length is within the title bounds,
  # otherwise "Page <number>".
  def title
    candidate = @text.lines.first&.strip
    return candidate if usable_title?(candidate)

    "Page #{@page_number}"
  end

  # The page text with runs of three or more newlines collapsed to two.
  def body
    collapsed = @text.gsub(/\n{3,}/, "\n\n")
    collapsed.strip
  end

  private
    def usable_title?(candidate)
      candidate.present? && candidate.length.between?(MIN_TITLE_LENGTH, MAX_TITLE_LENGTH)
    end
end
80 changes: 80 additions & 0 deletions app/models/pdf_picture.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Pulls the image XObjects out of a PDF page and turns each one into an
# attachable hash of the form { io:, filename:, content_type: }.
class PdfPicture
  # PDF colorspace names mapped to the raw format prefixes passed to convert.
  COLORSPACE_MAP = { DeviceRGB: "rgb", DeviceGray: "gray", DeviceCMYK: "cmyk" }.freeze

  # Convenience wrapper: PdfPicture.new(page).extract
  def self.extract_from(page)
    new(page).extract
  end

  def initialize(page)
    @source = page
  end

  # Returns one attachment hash per image XObject that could be converted.
  # Any error while walking the XObjects is logged and yields an empty list.
  def extract
    @source.xobjects.filter_map do |name, stream|
      attachment_for(stream, name) if stream.hash[:Subtype] == :Image
    end
  rescue => e
    Rails.logger.warn "PDF picture extraction failed: #{e.message}"
    []
  end

  private
    # Chooses a conversion based on the stream's first filter. Returns nil
    # (after logging) when the conversion raises.
    def attachment_for(stream, name)
      encoding = Array(stream.hash[:Filter]).first

      if encoding == :DCTDecode
        jpeg_attachment(stream, name)
      elsif encoding == :JPXDecode
        jp2_to_jpeg(stream, name)
      else
        raw_to_png(stream, name)
      end
    rescue => e
      Rails.logger.warn "PDF picture attachment failed for #{name}: #{e.message}"
      nil
    end

    # DCTDecode streams are attached as-is with a JPEG filename/content type.
    def jpeg_attachment(stream, name)
      { io: StringIO.new(stream.data), filename: "#{name}.jpg", content_type: "image/jpeg" }
    end

    # Writes the JPXDecode stream to a temp .jp2 file and converts it to JPEG
    # through MiniMagick's convert tool.
    def jp2_to_jpeg(stream, name)
      converted = Tempfile.create([ name.to_s, ".jp2" ], binmode: true) do |jp2_file|
        jp2_file.write(stream.data)
        jp2_file.flush

        MiniMagick::Tool::Convert.new do |convert|
          convert << jp2_file.path
          convert << "jpeg:-"
        end
      end

      { io: StringIO.new(converted), filename: "#{name}.jpg", content_type: "image/jpeg" }
    rescue => e
      Rails.logger.warn "PDF JP2 conversion failed for #{name}: #{e.message}"
      nil
    end

    # Treats the unfiltered stream as raw pixel data and converts it to PNG.
    # Returns nil when the dimensions are missing or the colorspace is not in
    # COLORSPACE_MAP.
    def raw_to_png(stream, name)
      attributes = stream.hash
      width = attributes[:Width]
      height = attributes[:Height]
      bit_depth = attributes[:BitsPerComponent] || 8
      colorspace = COLORSPACE_MAP[attributes[:ColorSpace]]

      return nil unless width && height && colorspace

      converted = Tempfile.create([ name.to_s, ".raw" ], binmode: true) do |pixels_file|
        pixels_file.write(stream.unfiltered_data)
        pixels_file.flush

        MiniMagick::Tool::Convert.new do |convert|
          convert.size "#{width}x#{height}"
          convert.depth bit_depth.to_s
          convert << "#{colorspace}:#{pixels_file.path}"
          convert << "png:-"
        end
      end

      { io: StringIO.new(converted), filename: "#{name}.png", content_type: "image/png" }
    rescue => e
      Rails.logger.warn "PDF raw picture conversion failed for #{name}: #{e.message}"
      nil
    end
end
2 changes: 2 additions & 0 deletions app/views/books/_create_buttons.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,5 @@
</svg>
<span class="for-screen-reader">Add a new section page</span>
<% end %>

<%= render "books/imports/import", book: book %>
37 changes: 37 additions & 0 deletions app/views/books/imports/_import.html.erb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<%# Import-PDF button plus the dialog that holds the upload form. Uses the `book` local for the form URL. %>
<div data-controller="dialog" data-action="keydown.esc->dialog#close">
<button type="button" data-action="click->dialog#open" class="btn btn--plain txt-medium fill-transparent disable-when-arranging disable-when-deleting" title="Import PDF" aria-label="Import PDF">
<svg viewBox="0 0 20 24" xmlns="http://www.w3.org/2000/svg" fill="var(--color-ink)">
<path d="m15.8 21.7c0 .3-.2.4-.4.4h-13.5-.2v-16.9c0-.3.2-.4.4-.4h6.3c0-.6.1-1.2.3-1.8h-6.9c-1 0-1.8.8-1.8 1.8v17.5c0 1 .8 1.8 1.8 1.8h14c.9 0 1.6-.6 1.8-1.4v-11.6c-.5.2-1.1.4-1.8.5v10.3z"/>
<path fill="var(--color-positive)" d="m15 0c-2.8 0-5 2.2-5 5s2.2 5 5 5 5-2.2 5-5-2.2-5-5-5zm.7 7.5-2 2-2-2 1-1 .4.4v-2.9h1.1v2.9l.5-.4z"/>
<path d="m4.5 14h9v1.5h-9zm0 3h6v1.5h-6z"/>
</svg>
<span class="for-screen-reader">Import PDF</span>
</button>

<%# The "loading" controller's start action (bound to the form's submit below) disables every data-loading-target="disable" element. %>
<dialog data-dialog-target="dialog" class="dialog panel shadow" data-controller="loading">
<form method="dialog">
<button class="btn panel__close" title="Close (esc)" data-loading-target="disable">
<%= image_tag "remove.svg", aria: { hidden: true }, size: 24 %>
<span class="for-screen-reader">Close</span>
</button>
</form>

<%# Multipart POST to the book's import route; Turbo is switched off for this form via data-turbo="false". %>
<%= form_with url: book_import_path(book), method: :post, multipart: true, data: { turbo: false, action: "submit->loading#start" } do |form| %>
<div class="flex align-center gap">
<label class="flex align-center gap full-width">
<div class="flex align-center gap input input--actor">
<%= image_tag "file-pdf.svg", aria: { hidden: true }, size: 24, class: "colorize--black" %>
<span class="search__input txt-large">
<%= form.file_field :pdf, accept: "application/pdf", required: true, class: "input--file__input" %>
</span>
</div>

<button class="btn btn--reversed txt-medium" title="Import PDF" type="submit" data-loading-target="disable">
<%= image_tag "arrow-right.svg", aria: { hidden: true }, size: 24 %>
<span class="for-screen-reader">Import PDF</span>
</button>
</label>
</div>
<% end %>
</dialog>
</div>
15 changes: 15 additions & 0 deletions config/initializers/pdf_reader_patch.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# pdf-reader decides whether the gap between two text runs becomes a space by
# comparing it with `font_size * 0.2`. With large or bold fonts that threshold
# is too high and word spaces get dropped (e.g. "Table Of Contents" becomes
# "TableOfContents"). This override uses `font_size * 0.1` instead: tight
# intra-word kerning still merges without a space, while typical word spacing
# in large fonts produces one.
PDF::Reader::TextRun.prepend(Module.new do
  # Merges `other` onto the end of this run, inserting a space when the
  # horizontal gap between the two runs reaches the threshold.
  def +(other)
    raise ArgumentError, "#{other} cannot be merged with this run" unless mergable?(other)

    gap = other.x - endx
    space_threshold = font_size * 0.1
    merged_text = gap < space_threshold ? text + other.text : "#{text} #{other.text}"

    self.class.new(x, y, other.endx - x, font_size, merged_text)
  end
end)
1 change: 1 addition & 0 deletions config/routes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
resources :books, except: %i[ index show ] do
resource :publication, controller: "books/publications", only: %i[ show edit update ]
resource :bookmark, controller: "books/bookmarks", only: :show
resource :import, controller: "books/imports", only: %i[ create ]
Comment thread
albertski marked this conversation as resolved.

scope module: "books" do
namespace :leaves do
Expand Down
64 changes: 64 additions & 0 deletions test/controllers/books/imports_controller_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
require "test_helper"

# Integration tests for POST book_import_path: successful imports, rejected
# uploads, error logging and access control.
class Books::ImportsControllerTest < ActionDispatch::IntegrationTest
  setup do
    sign_in :kevin
  end

  test "create imports leaves from a valid PDF" do
    assert_difference -> { handbook.leaves.active.count }, +2 do
      upload "sample.pdf", "application/pdf"
    end

    assert_redirected_to book_slug_url(handbook)
    assert_nil flash[:notice]
  end

  test "create with no file redirects with alert" do
    assert_no_difference -> { Leaf.count } do
      post book_import_path(handbook)
    end

    assert_import_rejected
  end

  test "create with a non-PDF file redirects with alert" do
    assert_no_difference -> { Leaf.count } do
      upload "reading.webp", "image/webp"
    end

    assert_import_rejected
  end

  test "create logs error when PDF parsing fails" do
    # Swap in a logger that writes to memory so the message can be inspected.
    captured_log = StringIO.new
    original_logger = Rails.logger
    Rails.logger = ActiveSupport::Logger.new(captured_log)

    assert_no_difference -> { Leaf.count } do
      upload "reading.webp", "image/webp"
    end

    assert_match "PdfImporter failed", captured_log.string
    assert_import_rejected
  ensure
    Rails.logger = original_logger
  end

  test "create is forbidden for non-editors" do
    sign_in :jz
    upload "sample.pdf", "application/pdf"
    assert_response :forbidden
  end

  private
    # The book fixture every test imports into.
    def handbook
      books(:handbook)
    end

    # Posts the named fixture file as the `pdf` param of the import request.
    def upload(filename, content_type)
      post book_import_path(handbook), params: {
        pdf: fixture_file_upload(filename, content_type)
      }
    end

    # A failed import redirects back to the book with the alert flash.
    def assert_import_rejected
      assert_redirected_to book_slug_url(handbook)
      assert_equal "Could not import PDF.", flash[:alert]
    end
end
Binary file added test/fixtures/files/blank.pdf
Binary file not shown.
Binary file added test/fixtures/files/sample.pdf
Binary file not shown.
Binary file added test/fixtures/files/sample_with_image.pdf
Binary file not shown.
Loading
Loading