# NOTE(review): the lines that preceded this header were GitHub web-UI residue
# ("Skip to content", page title "Data Processing", run number "#228") from a
# copy-paste of the Actions page, not part of the workflow itself.
---
name: Data Processing
# This workflow is triggered daily at midnight and can also be manually triggered.
# It processes data from various scripts and uploads the processed data as an artifact.
# The data is used to update the website's content.
on:
  schedule:
    - cron: '0 0 * * *' # Daily at midnight
  workflow_dispatch:

jobs:
  process-data:
    name: Process Data
    runs-on: ubuntu-22.04
    permissions:
      contents: write       # required to push the GA-data branch
      pull-requests: write  # required for `gh pr create`
    env:
      PYTHON_VERSION: "3.11"
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # NOTE(review): commits are attributed to a personal account; consider the
      # github-actions[bot] identity so automated commits are clearly marked —
      # confirm with the maintainer before changing authorship.
      - name: Configure Git
        run: |
          git config --global user.email "mudaherarich@gmail.com"
          git config --global user.name "richarddushime"

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      # NOTE(review): @master is a moving target; pin to a release tag or commit
      # SHA for reproducible, supply-chain-safe builds.
      - name: Setup r2u
        uses: eddelbuettel/github-actions/r2u-setup@master

      - uses: r-lib/actions/setup-pandoc@v2

      - name: Install tenzing R dependencies
        run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "readxl", "dplyr", "googlesheets4", "stringr", "gridExtra", "glue", "tidygraph", "ggraph", "igraph", "visNetwork"))'

      - name: Run Tenzing analysis
        continue-on-error: true # Continue even if this step fails
        run: |
          Rscript -e "rmarkdown::render('scripts/contributor-analysis/contributor_analysis.rmd')"

      - name: Move Tenzing analysis
        continue-on-error: true # Continue even if this step fails
        run: |
          mv scripts/contributor-analysis/contributor_analysis.md content/contributor-analysis/index.md
          mv scripts/contributor-analysis/*.png content/contributor-analysis/
          rm -rf content/contributor-analysis/htmlwidgets_libs
          mv scripts/contributor-analysis/htmlwidgets_libs content/contributor-analysis/
          # Strip raw-HTML fence markers so the markdown renders on the site.
          sed -i.bak -e '/^```{=html}$/d' -e '/^```$/d' content/contributor-analysis/index.md && rm content/contributor-analysis/index.md.bak

      - name: Install Python dependencies
        run: python3 -m pip install -r ./requirements.txt

      - name: Run Tenzing script
        continue-on-error: true # Continue even if this step fails
        run: python3 scripts/forrt_contribs/tenzing.py

      - name: Run Curated Resources script
        continue-on-error: true # Continue even if this step fails
        run: python3 content/resources/resource.py

      - name: Move and validate Tenzing output
        continue-on-error: true # Continue even if this step fails
        run: |
          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md
          if [ ! -f content/contributors/tenzing.md ]; then
            echo "tenzing.md not found"
            exit 1
          fi

      - name: Validate curated resources
        continue-on-error: true # Continue even if this step fails
        run: |
          # Every entry must be a regular file (the test checks file type, not
          # extension — the message now says so; it previously claimed
          # "Non-markdown file found", which did not match the check).
          for file in content/curated_resources/*; do
            if [ ! -f "$file" ]; then
              echo "Not a regular file: $file"
              exit 1
            fi
          done

      - name: Download GA Data
        continue-on-error: true # Continue even if this step fails
        env:
          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
        run: |
          if [ -z "$GA_API_CREDENTIALS" ] || [ -z "$GA_PROPERTY_ID" ]; then
            echo "❌ GA credentials not set"
            exit 1
          fi
          rm -f data/ga_data.json
          rm -rf data/ga_data/
          # python3 for consistency with every other step in this job.
          python3 scripts/download_ga_data.py
          if [ -f "data/ga_data.json" ]; then
            echo "✅ GA data file created successfully"
            echo "File size: $(wc -c < data/ga_data.json) bytes"
            # Quick validation of data structure
            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
          else
            echo "❌ GA data file was not created"
            exit 1
          fi

      - name: Create PR for GA data update
        if: github.event_name != 'pull_request'
        continue-on-error: true # Continue even if this step fails
        env:
          GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
          GH_TOKEN: ${{ secrets.FORRT_PAT }}
        run: |
          echo "=== Creating PR for GA data update ==="
          # Check if it's the first day of the month OR manually triggered
          CURRENT_DAY=$(date +%d)
          if [ "$CURRENT_DAY" != "01" ] && [ "${{ github.event_name }}" != "workflow_dispatch" ]; then
            echo "ℹ️ Skipping PR creation (not 1st of month and not manual trigger)"
            exit 0
          fi
          BRANCH_NAME="ga-data-update-$(date +%Y%m%d-%H%M%S)"
          git fetch origin master
          git checkout master
          # Delete local branch if it exists
          git branch -D "$BRANCH_NAME" 2>/dev/null || true
          git checkout -b "$BRANCH_NAME"
          # Verify we're on the correct branch
          CURRENT_BRANCH=$(git branch --show-current)
          if [ "$CURRENT_BRANCH" != "$BRANCH_NAME" ]; then
            echo "❌ Failed to create branch $BRANCH_NAME, currently on $CURRENT_BRANCH"
            exit 1
          fi
          echo "✅ Created and switched to branch: $BRANCH_NAME"
          # Add and commit the GA data file
          echo "Adding GA data file..."
          git add data/ga_data.json
          git commit -m "Update GA data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')"
          # NOTE(review): branch names are timestamped and effectively unique, so
          # the plain-push fallback after a --force-with-lease failure is likely
          # dead code; kept as-is pending confirmation it is never needed.
          if ! git push origin "$BRANCH_NAME" --force-with-lease; then
            git push origin "$BRANCH_NAME"
          fi
          gh pr create \
            --title "📊 Monthly GA Data Update - $(date '+%B %Y')" \
            --body "Automated monthly Google Analytics data update. Generated on $(date -u +'%Y-%m-%d %H:%M:%S UTC'). Files changed: data/ga_data.json" \
            --base master \
            --head "$BRANCH_NAME" \
            --label "ga-data,monthly-update"
          echo "✅ PR created for GA data update"

      - name: Run Google Scholar script
        continue-on-error: true
        env:
          SERPAPI: ${{ secrets.SERPAPI }}
        run: python3 scripts/gs-cite/google_scholar.py

      - name: Upload data artifact
        id: upload-artifact
        uses: actions/upload-artifact@v4
        with:
          name: data-artifact
          path: |
            content/contributors/tenzing.md
            content/curated_resources/
            data/
            content/contributor-analysis/
            content/publications/citation_chart.webp
          retention-days: 1