Commit 69cb24d

Merge pull request #1473 from MITLibraries/etd-677-retry-apt

Adds retry logic to PreservationSubmissionJob

2 parents 1a3ca0f + 30f7a03

File tree

3 files changed: +87 −33 lines

README.md

Lines changed: 10 additions & 3 deletions

````diff
@@ -374,11 +374,18 @@ processors must create a corresponding Archivematica accession number.
 1. Following the processing workflow, stakeholders choose a term to publish (Process theses - Select term - Select
    Publication Review - Publish)
 2. ETD will now automatically send data to DSS via the SQS queue
-3. DSS runs (as of this writing that is a manual process documented in the
-   [DSS repo](https://github.com/MITLibraries/dspace-submission-service#run-stage))
+3. DSS runs (as of this writing that is no longer documented in the
+   [DSS repo](https://github.com/MITLibraries/dspace-submission-service))
+   * From the root of a local checkout of `dspace-submission-service`
+   * Python dependencies do *not* need to be installed
+   * Authenticate your terminal to the appropriate AWS account and role (dssManagement)
+   * `make run-prod` (or `make run-stage`)
 4. ETD processes the output queue to update records and send email to stakeholders with summary data and a list
    of any error records. As of now this is a manual process, but it can be triggered via rake task using the following heroku-cli command:
 
+> [!IMPORTANT]
+> Check the AWS Console to ensure the APT Lambda is active (i.e. not inactive) before proceeding. If you do not, the preservation jobs may fail if they have not run for a few days.
+
 ```shell
 # run the output queue processing job
 heroku run rails dss:process_output_queue --app TARGET-HEROKU-APP
@@ -397,7 +404,7 @@ You can publish a single thesis that is already in `Publication review` or `Pend
 task:
 
 ```shell
-heroku run -s standard-2x rails dss:publish_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP
+heroku run rails dss:publish_thesis_by_id[THESIS_ID] --app TARGET-HEROKU-APP
 ```
 
 Note: `Pending publication` is allowed here, but it is not expected to be a normal occurrence; it handles the edge case of the app thinking data was sent to SQS but the data not arriving for any reason.
````
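The IMPORTANT callout above can also be checked from a terminal rather than the AWS Console; a sketch using the AWS CLI, assuming an authenticated session (the function name `apt-lambda` is a placeholder — substitute the real function name):

```shell
# Check whether the APT Lambda is warm before kicking off preservation jobs.
# "apt-lambda" is a placeholder; State will be one of Active, Inactive, or Pending.
aws lambda get-function-configuration \
  --function-name apt-lambda \
  --query 'State' \
  --output text
```

If this prints `Inactive`, the first invocation will trigger a warm-up and may take several minutes to succeed.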

app/jobs/preservation_submission_job.rb

Lines changed: 23 additions & 4 deletions

```diff
@@ -4,6 +4,16 @@ class PreservationSubmissionJob < ActiveJob::Base
 
   queue_as :default
 
+  # Custom error class for 502 Bad Gateway responses from APT
+  class APTBadGatewayError < StandardError; end
+
+  # Retry up to 10 times for transient 502 Bad Gateway responses. These 502 errors are generally caused by the lambda
+  # entering the `Inactive` state after a long period of inactivity, which can take several minutes to recover from.
+  # We use a fixed wait of 5 minutes between retries to give the lambda time to warm up, rather than retrying
+  # immediately with exponential backoff: we expect the first few retries to fail in that scenario, so the longer
+  # fixed wait is more effective.
+  retry_on APTBadGatewayError, wait: 5.minutes, attempts: 10
+
   def perform(theses)
     Rails.logger.info("Preparing to send #{theses.count} theses to preservation")
     results = { total: theses.count, processed: 0, errors: [] }
@@ -13,7 +23,10 @@ def perform(theses)
       preserve_payload(payload)
       Rails.logger.info("Thesis #{thesis.id} has been sent to preservation")
       results[:processed] += 1
-    rescue StandardError, Aws::Errors => e
+    rescue StandardError => e
+      # Explicitly re-raise the 502-specific error so ActiveJob can retry it.
+      raise e if e.is_a?(APTBadGatewayError)
+
       preservation_error = "Thesis #{thesis.id} could not be preserved: #{e}"
       Rails.logger.info(preservation_error)
       results[:errors] << preservation_error
@@ -40,6 +53,12 @@ def post_payload(payload)
       http.request(request)
     end
 
+    # If the remote endpoint returns 502, raise an APTBadGatewayError so ActiveJob can retry.
+    if response.code.to_s == '502'
+      Rails.logger.warn("Received 502 from APT for payload #{payload.id}; raising for retry")
+      raise APTBadGatewayError, 'APT returned 502 Bad Gateway'
+    end
+
     validate_response(response)
   end
 
@@ -49,8 +68,8 @@ def validate_response(response)
     end
 
     result = JSON.parse(response.body)
-    unless result['success'] == true
-      raise "APT failed to create a bag: #{response.body}"
-    end
+    return if result['success'] == true
+
+    raise "APT failed to create a bag: #{response.body}"
   end
 end
```
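The retry design above has two halves: `post_payload` converts a 502 into a dedicated `APTBadGatewayError`, and `perform` re-raises only that class so `retry_on` can reschedule the job, while every other error is swallowed into the results. A minimal plain-Ruby sketch of that classification outside Rails/ActiveJob (method names here are illustrative, not the app's API):

```ruby
require 'json'

# Retryable error class, mirroring the job's APTBadGatewayError.
class APTBadGatewayError < StandardError; end

# Mimics post_payload/validate_response: a 502 becomes a retryable error,
# any other failure is raised as a generic error for the caller to record.
def classify_response(status, body)
  raise APTBadGatewayError, 'APT returned 502 Bad Gateway' if status == 502

  result = JSON.parse(body)
  return :ok if result['success'] == true

  raise "APT failed to create a bag: #{body}"
end

# Mimics perform's rescue: re-raise the retryable class so the (hypothetical)
# job framework can retry it; collect everything else and keep going.
def process(status, body, errors)
  classify_response(status, body)
  :processed
rescue StandardError => e
  raise e if e.is_a?(APTBadGatewayError)

  errors << e.message
  :errored
end
```

The key design choice is that the retryable error must escape the blanket `rescue StandardError`, otherwise `retry_on` never sees it and the 502 is silently logged like any other failure.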

test/jobs/preservation_submission_job_test.rb

Lines changed: 54 additions & 26 deletions

```diff
@@ -2,7 +2,6 @@
 require 'webmock/minitest'
 
 class PreservationSubmissionJobTest < ActiveJob::TestCase
-
   # because we need to actually use the file it's easier to attach it in the test rather
   # than use our fixtures as the fixtures oddly don't account for the file actually being
   # where ActiveStorage expects them to be. We also need this to be a record that looks like
@@ -27,38 +26,38 @@ def setup_thesis_two
   end
 
   def stub_apt_lambda_success
-    stub_request(:post, ENV['APT_LAMBDA_URL'])
+    stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil))
       .to_return(
         status: 200,
         body: {
           success: true,
           bag: {
             entries: {
-              "manifest-md5.txt" => { md5: "fakehash" }
+              'manifest-md5.txt' => { md5: 'fakehash' }
             }
           },
-          output_zip_s3_uri: "s3://my-bucket/apt-testing/test-one-medium.zip"
+          output_zip_s3_uri: 's3://my-bucket/apt-testing/test-one-medium.zip'
         }.to_json,
         headers: { 'Content-Type' => 'application/json' }
       )
   end
 
   def stub_apt_lambda_failure
-    stub_request(:post, ENV['APT_LAMBDA_URL'])
+    stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil))
       .to_return(
         status: 500,
         headers: { 'Content-Type' => 'application/json' }
       )
   end
 
   def stub_apt_lambda_200_failure
-    stub_request(:post, ENV['APT_LAMBDA_URL'])
+    stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil))
       .to_return(
         status: 200,
         body: {
           success: false,
-          error: "An error occurred (404) when calling the HeadObject operation: Not Found",
-          bag: { entries: {}},
+          error: 'An error occurred (404) when calling the HeadObject operation: Not Found',
+          bag: { entries: {} }
         }.to_json,
         headers: { 'Content-Type' => 'application/json' }
       )
@@ -146,45 +145,74 @@ def stub_apt_lambda_200_failure
     end
   end
 
-  test 'does not create payloads if job fails' do
+  # This is not the desired behavior, but it confirms the current behavior.
+  test 'creates payloads even if APT job fails' do
     stub_apt_lambda_failure
-    bad_thesis = theses(:one)
     good_thesis = setup_thesis
     another_good_thesis = setup_thesis_two
-    assert_empty bad_thesis.archivematica_payloads
     assert_empty good_thesis.archivematica_payloads
     assert_empty another_good_thesis.archivematica_payloads
 
-    PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis])
+    PreservationSubmissionJob.perform_now([good_thesis, another_good_thesis])
 
-    # first thesis should succeed and have a payload
     assert_equal 1, good_thesis.archivematica_payloads.count
 
-    # second thesis should fail and not have a payload
-    assert_empty bad_thesis.archivematica_payloads
-
-    # third thesis should succeed and have a payload, despite prior failure
     assert_equal 1, another_good_thesis.archivematica_payloads.count
   end
 
-  test 'does not create payloads if post succeeds but APT fails' do
+  # This is not the desired behavior, but it confirms the current behavior.
+  test 'creates payloads if post succeeds but APT fails' do
     stub_apt_lambda_200_failure
-    bad_thesis = theses(:one)
     good_thesis = setup_thesis
     another_good_thesis = setup_thesis_two
-    assert_empty bad_thesis.archivematica_payloads
     assert_empty good_thesis.archivematica_payloads
     assert_empty another_good_thesis.archivematica_payloads
 
-    PreservationSubmissionJob.perform_now([good_thesis, bad_thesis, another_good_thesis])
+    PreservationSubmissionJob.perform_now([good_thesis, another_good_thesis])
 
-    # first thesis should succeed and have a payload
     assert_equal 1, good_thesis.archivematica_payloads.count
 
-    # second thesis should fail and not have a payload
-    assert_empty bad_thesis.archivematica_payloads
-
-    # third thesis should succeed and have a payload, despite prior failure
     assert_equal 1, another_good_thesis.archivematica_payloads.count
   end
+
+  test 'throws exceptions and probably creates payloads when a bad key is provided' do
+    skip('Test not implemented yet')
+    # Our lambda returns 400 Bad Request with an error body of "Invalid input payload".
+    # This should never happen as we submit the key via ENV, but just in case we should understand what it looks like.
+  end
+
+  test 'retries on 502 Bad Gateway errors' do
+    # This syntax causes the first two calls to return 502 and the third to succeed.
+    stub_request(:post, ENV.fetch('APT_LAMBDA_URL', nil))
+      .to_return(
+        { status: 502 },
+        { status: 502 },
+        { status: 200, body: {
+          success: true,
+          bag: {
+            entries: {
+              'manifest-md5.txt' => { md5: 'fakehash' }
+            }
+          },
+          output_zip_s3_uri: 's3://my-bucket/apt-testing/test-one-medium.zip'
+        }.to_json, headers: { 'Content-Type' => 'application/json' } }
+      )
+
+    thesis = setup_thesis
+    assert_equal 0, thesis.archivematica_payloads.count
+
+    # We need to wrap this in perform_enqueued_jobs so that the retries are actually executed. Our normal syntax only
+    # runs a single job, but because we are testing retries, this wrapper also executes the jobs this job enqueues.
+    perform_enqueued_jobs do
+      PreservationSubmissionJob.perform_now([thesis])
+    end
+
+    # 3 payloads were created: 2 for the failures, 1 for the success.
+    # This isn't ideal, but it's the current behavior. A refactor to use a pre-preservation job to create
+    # metadata.csv prior to this job would fix this, but that is out of scope for now.
+    assert_equal 3, thesis.archivematica_payloads.count
+
+    # The last payload should be marked as preserved.
+    assert_equal 'preserved', thesis.archivematica_payloads.last.preservation_status
+  end
 end
```
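The new retry test relies on WebMock returning its canned responses in declaration order: the first two calls get a 502 and later calls get the 200. That sequencing can be illustrated in plain Ruby (a hypothetical `FakeEndpoint`, not WebMock's implementation):

```ruby
# Hypothetical stand-in for a stub declared with to_return(r1, r2, r3):
# each call consumes the next canned response, and the last one repeats
# for any further calls (WebMock behaves similarly once the list is exhausted).
class FakeEndpoint
  def initialize(*responses)
    @responses = responses
  end

  def call
    @responses.length > 1 ? @responses.shift : @responses.first
  end
end

endpoint = FakeEndpoint.new({ status: 502 }, { status: 502 }, { status: 200 })
endpoint.call # first call returns { status: 502 }
```

This is why the test needs exactly two retries to pass: attempts one and two consume the 502s, and attempt three (the second retry) hits the 200.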
