# Data Processing #228
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Data Processing

# This workflow runs daily at midnight (UTC, as all GitHub cron schedules are)
# and can also be triggered manually.
# It processes data from various scripts and uploads the processed data as an
# artifact. The data is used to update the website's content.
#
# Most steps set `continue-on-error: true` so that one failing data source does
# not prevent the remaining sources from being processed and uploaded.

on:
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight (UTC)
  workflow_dispatch:

jobs:
  process-data:
    name: Process Data
    runs-on: ubuntu-22.04
    permissions:
      contents: write
      pull-requests: write
    env:
      # Quoted so the version is not parsed as a float.
      PYTHON_VERSION: "3.11"

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Configure Git
        run: |
          git config --global user.email "mudaherarich@gmail.com"
          git config --global user.name "richarddushime"

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          cache: 'pip'

      # NOTE(review): this action is referenced by the mutable `master` branch;
      # consider pinning it to a release tag or commit SHA.
      - name: Setup r2u
        uses: eddelbuettel/github-actions/r2u-setup@master

      - uses: r-lib/actions/setup-pandoc@v2

      - name: Install tenzing R dependencies
        run: Rscript -e 'install.packages(c("rmarkdown","ggplot2", "readxl", "dplyr", "googlesheets4", "stringr", "gridExtra", "glue", "tidygraph", "ggraph", "igraph", "visNetwork"))'

      - name: Run Tenzing analysis
        continue-on-error: true  # Continue even if this step fails
        run: |
          Rscript -e "rmarkdown::render('scripts/contributor-analysis/contributor_analysis.rmd')"

      # Relocates the rendered report and its assets into the site content tree,
      # then strips raw-HTML fence markers from the generated markdown.
      - name: Move Tenzing analysis
        continue-on-error: true  # Continue even if this step fails
        run: |
          mv scripts/contributor-analysis/contributor_analysis.md content/contributor-analysis/index.md
          mv scripts/contributor-analysis/*.png content/contributor-analysis/
          # Remove any previous copy first so the mv below does not nest into it.
          rm -rf content/contributor-analysis/htmlwidgets_libs
          mv scripts/contributor-analysis/htmlwidgets_libs content/contributor-analysis/
          sed -i.bak -e '/^```{=html}$/d' -e '/^```$/d' content/contributor-analysis/index.md && rm content/contributor-analysis/index.md.bak

      - name: Install Python dependencies
        run: python3 -m pip install -r ./requirements.txt

      - name: Run Tenzing script
        continue-on-error: true  # Continue even if this step fails
        run: python3 scripts/forrt_contribs/tenzing.py

      - name: Run Curated Resources script
        continue-on-error: true  # Continue even if this step fails
        run: python3 content/resources/resource.py

      - name: Move and validate Tenzing output
        continue-on-error: true  # Continue even if this step fails
        run: |
          # Validate BEFORE moving: the default shell is `bash -e`, so a failing
          # mv would abort the step before any later check could report anything.
          if [ ! -f scripts/forrt_contribs/tenzing.md ]; then
            echo "tenzing.md not found"
            exit 1
          fi
          mv scripts/forrt_contribs/tenzing.md content/contributors/tenzing.md

      - name: Validate curated resources
        continue-on-error: true  # Continue even if this step fails
        run: |
          # nullglob makes an empty directory yield an empty list instead of the
          # literal pattern, so the empty case can be reported accurately.
          shopt -s nullglob
          entries=(content/curated_resources/*)
          if [ "${#entries[@]}" -eq 0 ]; then
            echo "No files found in content/curated_resources/"
            exit 1
          fi
          # NOTE(review): the previous message said "Non-markdown file", but the
          # test only checks for regular files; confirm whether an extension
          # check was intended.
          for file in "${entries[@]}"; do
            if [ ! -f "$file" ]; then
              echo "Not a regular file: $file"
              exit 1
            fi
          done

      - name: Download GA Data
        id: download-ga
        continue-on-error: true  # Continue even if this step fails
        env:
          GA_API_CREDENTIALS: ${{ secrets.GA_API_CREDENTIALS }}
          GA_PROPERTY_ID: ${{ secrets.GA_PROPERTY_ID }}
        run: |
          if [ -z "$GA_API_CREDENTIALS" ] || [ -z "$GA_PROPERTY_ID" ]; then
            echo "❌ GA credentials not set"
            exit 1
          fi
          # Clear previous output so a stale file cannot pass the check below.
          rm -f data/ga_data.json
          rm -rf data/ga_data/
          python3 scripts/download_ga_data.py
          if [ -f "data/ga_data.json" ]; then
            echo "✅ GA data file created successfully"
            echo "File size: $(wc -c < data/ga_data.json) bytes"
            # Quick validation of data structure
            python3 -c "import json; data = json.load(open('data/ga_data.json')); print('✅ GA data:', len(data.get('regions', [])), 'countries,', len(data.get('top_pages', [])), 'pages')"
          else
            echo "❌ GA data file was not created"
            exit 1
          fi

      - name: Create PR for GA data update
        # `outcome` is the step result before continue-on-error is applied.
        # Without this guard a failed download (which has already deleted
        # data/ga_data.json) could be committed as a deletion of the data file.
        if: github.event_name != 'pull_request' && steps.download-ga.outcome == 'success'
        continue-on-error: true  # Continue even if this step fails
        run: |
          echo "=== Creating PR for GA data update ==="
          # Check if it's the first day of the month OR manually triggered.
          # GITHUB_EVENT_NAME is the runner-provided equivalent of
          # github.event_name; using it avoids expanding an expression
          # inside the shell script.
          CURRENT_DAY=$(date +%d)
          if [ "$CURRENT_DAY" != "01" ] && [ "$GITHUB_EVENT_NAME" != "workflow_dispatch" ]; then
            echo "ℹ️ Skipping PR creation (not 1st of month and not manual trigger)"
            exit 0
          fi
          BRANCH_NAME="ga-data-update-$(date +%Y%m%d-%H%M%S)"
          git fetch origin master
          git checkout master
          # Delete local branch if it exists
          git branch -D "$BRANCH_NAME" 2>/dev/null || true
          git checkout -b "$BRANCH_NAME"
          # Verify we're on the correct branch
          CURRENT_BRANCH=$(git branch --show-current)
          if [ "$CURRENT_BRANCH" != "$BRANCH_NAME" ]; then
            echo "❌ Failed to create branch $BRANCH_NAME, currently on $CURRENT_BRANCH"
            exit 1
          fi
          echo "✅ Created and switched to branch: $BRANCH_NAME"
          # Add and commit the GA data file
          echo "Adding GA data file..."
          git add data/ga_data.json
          # `git commit` exits non-zero when nothing is staged; treat unchanged
          # data as a clean no-op rather than a step failure.
          if git diff --cached --quiet; then
            echo "ℹ️ No changes in data/ga_data.json, skipping PR creation"
            exit 0
          fi
          git commit -m "Update GA data - $(date -u +'%Y-%m-%d %H:%M:%S UTC')"
          if ! git push origin "$BRANCH_NAME" --force-with-lease; then
            git push origin "$BRANCH_NAME"
          fi
          gh pr create \
            --title "📊 Monthly GA Data Update - $(date '+%B %Y')" \
            --body "Automated monthly Google Analytics data update. Generated on $(date -u +'%Y-%m-%d %H:%M:%S UTC'). Files changed: data/ga_data.json" \
            --base master \
            --head "$BRANCH_NAME" \
            --label "ga-data,monthly-update"
          echo "✅ PR created for GA data update"
        env:
          GITHUB_TOKEN: ${{ secrets.FORRT_PAT }}
          GH_TOKEN: ${{ secrets.FORRT_PAT }}

      - name: Run Google Scholar script
        continue-on-error: true
        run: python3 scripts/gs-cite/google_scholar.py
        env:
          SERPAPI: ${{ secrets.SERPAPI }}

      - name: Upload data artifact
        id: upload-artifact
        uses: actions/upload-artifact@v4
        with:
          name: data-artifact
          path: |
            content/contributors/tenzing.md
            content/curated_resources/
            data/
            content/contributor-analysis/
            content/publications/citation_chart.webp
          retention-days: 1