Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 2 additions & 126 deletions .github/workflows/metadata-catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,132 +59,8 @@ jobs:
cat dist/index.json | jq .
echo "Partition files:"
jq -r '.partitions[].path' dist/index.json
# Continue-on-error allows subsequent steps to run even if this one fails.
# We need this so the artifact upload step can execute on failure.
continue-on-error: true

- name: Upload validation report artifact
# This step runs on both success and failure
# `always()` is required because the preceding generation step uses
# continue-on-error: without it, a failed generation would skip this
# upload and the report in dist/index.json would be lost.
if: always()
uses: actions/upload-artifact@v4
with:
name: validation-report
path: dist/index.json

- name: Check for validation failure
# This step explicitly fails the job if the generation step failed.
# The generation step runs with continue-on-error, so the job would
# otherwise finish green; checking steps.generate-metadata.outcome
# restores the real failure status after the artifact has been uploaded.
if: steps.generate-metadata.outcome == 'failure'
run: |
echo "Metadata generation failed. See the 'validation-report' artifact for details."
exit 1

- name: Upload metadata to S3 (sync)
# Mirrors dist/metadata/ to the configured S3 base path; `sync` only
# transfers new/changed files.
# NOTE(review): `aws s3 sync` does not remove remote objects deleted
# locally — confirm whether --delete is desired for stale partitions.
run: |
aws s3 sync dist/metadata/ ${{ env.S3_BASE_PATH }}/

- name: Repair / add partitions
  # MSCK REPAIR TABLE registers partitions found in S3 with the catalog.
  # FIX: start-query-execution is asynchronous — the original step fired
  # the query and moved on, so the following "Verify partition count"
  # step could run before the repair finished. We now poll for a
  # terminal state (same pattern as the sibling steps, ~2 min budget).
  run: |
    QID=$(aws athena start-query-execution \
      --query-string "MSCK REPAIR TABLE ${TABLE_NAME}" \
      --query-execution-context Database="${ATHENA_DATABASE}" \
      --result-configuration OutputLocation="${ATHENA_RESULT_BUCKET}" \
      | jq -r '.QueryExecutionId')
    echo "QueryExecutionId=$QID"
    STATUS="PENDING"
    for i in $(seq 1 30); do
      STATUS=$(aws athena get-query-execution --query-execution-id "$QID" \
        | jq -r '.QueryExecution.Status.State')
      if [ "$STATUS" = "SUCCEEDED" ]; then break; fi
      if [ "$STATUS" = "FAILED" ] || [ "$STATUS" = "CANCELLED" ]; then
        echo "::error::MSCK REPAIR query $QID ended with status $STATUS" >&2
        exit 1
      fi
      sleep 4
    done
    if [ "$STATUS" != "SUCCEEDED" ]; then
      echo "::error::MSCK REPAIR query $QID did not succeed (status=$STATUS)" >&2
      exit 1
    fi

- name: Verify partition count
  # Runs SHOW PARTITIONS through Athena, waits for completion, downloads
  # the result CSV and prints it. Fails the job on query error/timeout.
  run: |
    # FIX: --query-execution-context was passed twice in the original
    # command (the AWS CLI silently keeps only the last occurrence);
    # the duplicate has been removed. $QID is now quoted (SC2086).
    QID=$(aws athena start-query-execution \
      --query-string "SHOW PARTITIONS ${TABLE_NAME}" \
      --query-execution-context Database="${ATHENA_DATABASE}" \
      --result-configuration OutputLocation="${ATHENA_RESULT_BUCKET}" \
      | jq -r '.QueryExecutionId')
    echo "QueryExecutionId=$QID"
    # Poll until the query reaches a terminal state (30 x 4s ≈ 2 min).
    STATUS="PENDING"
    RESULT_JSON=""
    for i in $(seq 1 30); do
      RESULT_JSON=$(aws athena get-query-execution --query-execution-id "$QID")
      STATUS=$(echo "$RESULT_JSON" | jq -r '.QueryExecution.Status.State')
      if [ "$STATUS" = "SUCCEEDED" ]; then break; fi
      if [ "$STATUS" = "FAILED" ] || [ "$STATUS" = "CANCELLED" ]; then
        echo "::error::Athena query $QID ended with status $STATUS" >&2
        echo "$RESULT_JSON" | jq -r '.QueryExecution.Status.StateChangeReason' >&2
        exit 1
      fi
      sleep 4
    done
    if [ "$STATUS" != "SUCCEEDED" ]; then
      echo "::error::Athena query $QID did not succeed (status=$STATUS)" >&2
      exit 1
    fi
    OUTPUT_URI=$(echo "$RESULT_JSON" | jq -r '.QueryExecution.ResultConfiguration.OutputLocation')
    if [ -z "$OUTPUT_URI" ] || [ "$OUTPUT_URI" = "null" ]; then
      echo "::error::Athena did not return an output location for query $QID" >&2
      exit 1
    fi
    echo "OutputLocation=$OUTPUT_URI"
    # The result object can land in S3 slightly after SUCCEEDED; retry briefly.
    DOWNLOAD_OK="false"
    for attempt in $(seq 1 5); do
      if aws s3 cp "$OUTPUT_URI" partitions.csv; then
        DOWNLOAD_OK="true"
        break
      fi
      echo "Waiting for result file to appear... (attempt $attempt)"
      sleep 2
    done
    if [ "$DOWNLOAD_OK" != "true" ]; then
      echo "::error::Failed to download Athena results from $OUTPUT_URI" >&2
      exit 1
    fi
    echo "Partitions:"
    cat partitions.csv

- name: Sample query (count rows)
  # Smoke-tests the table with a COUNT(*) query, waits for completion,
  # downloads the result CSV and prints it. Same poll/download pattern
  # as "Verify partition count".
  run: |
    # FIX: quoted "$QID" in the get-query-execution call (SC2086) and
    # dropped stray ';' statement terminators for consistency.
    QID=$(aws athena start-query-execution \
      --query-string "SELECT count(*) FROM ${TABLE_NAME}" \
      --query-execution-context Database="${ATHENA_DATABASE}" \
      --result-configuration OutputLocation="${ATHENA_RESULT_BUCKET}" \
      | jq -r '.QueryExecutionId')
    # Poll until the query reaches a terminal state (30 x 4s ≈ 2 min).
    STATUS="PENDING"
    RESULT_JSON=""
    for i in $(seq 1 30); do
      RESULT_JSON=$(aws athena get-query-execution --query-execution-id "$QID")
      STATUS=$(echo "$RESULT_JSON" | jq -r '.QueryExecution.Status.State')
      if [ "$STATUS" = "SUCCEEDED" ]; then break; fi
      if [ "$STATUS" = "FAILED" ] || [ "$STATUS" = "CANCELLED" ]; then
        echo "::error::Athena query $QID ended with status $STATUS" >&2
        echo "$RESULT_JSON" | jq -r '.QueryExecution.Status.StateChangeReason' >&2
        exit 1
      fi
      sleep 4
    done
    if [ "$STATUS" != "SUCCEEDED" ]; then
      echo "::error::Athena query $QID did not succeed (status=$STATUS)" >&2
      exit 1
    fi
    OUTPUT_URI=$(echo "$RESULT_JSON" | jq -r '.QueryExecution.ResultConfiguration.OutputLocation')
    if [ -z "$OUTPUT_URI" ] || [ "$OUTPUT_URI" = "null" ]; then
      echo "::error::Athena did not return an output location for query $QID" >&2
      exit 1
    fi
    echo "OutputLocation=$OUTPUT_URI"
    # The result object can land in S3 slightly after SUCCEEDED; retry briefly.
    DOWNLOAD_OK="false"
    for attempt in $(seq 1 5); do
      if aws s3 cp "$OUTPUT_URI" rowcount.csv; then
        DOWNLOAD_OK="true"
        break
      fi
      echo "Waiting for result file to appear... (attempt $attempt)"
      sleep 2
    done
    if [ "$DOWNLOAD_OK" != "true" ]; then
      echo "::error::Failed to download Athena results from $OUTPUT_URI" >&2
      exit 1
    fi
    echo "Row count:"
    cat rowcount.csv

- name: Upload and process metadata
  # Reconstructed post-merge step: this span of the diff interleaved the
  # removed "Post-run summary" echo block with the two added lines. The
  # merged result delegates publishing to the project CLI.
  run: |
    uv run publish-metadata-to-s3 --root .

# Optional: simple failure notification step could be added here.
2 changes: 2 additions & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[MESSAGES CONTROL]
disable=logging-fstring-interpolation
Loading