Patch summary (text before the first "diff --git" line is ignored by patch tools).

.github/workflows/render.yml
  * adds an "Install required R packages" step that runs install.packages()
    for the packages listed in that step
  * switches actions/upload-artifact from v3 to v4

dataverse_metrics.Rmd
  * replaces a whitespace-only line with an empty line in the scrape branch
  * after the load/scrape block, converts every element of deposits$subjects
    to character; NULL or logical elements become NA_character_
  * builds the "Deposits by subject" table from four independent per-subject
    summaries (published, published in period, unpublished, unpublished in
    period) that are full-joined on "subjects", instead of chaining
    full_join(deposits_subj) without an explicit key
  * labels missing subjects "Unknown" and fills missing counts with 0L
  * treats an NA dataverse_key the same as an empty one when picking columns

NOTE(review): line breaks and indentation in this patch were reconstructed
from a copy whose whitespace had been collapsed. The hunk line counts are
consistent with the @@ headers, but exact indentation, trailing whitespace
and blank context lines are assumptions -- verify against the repository
before applying.

diff --git a/.github/workflows/render.yml b/.github/workflows/render.yml
index a87dc47..edd748e 100644
--- a/.github/workflows/render.yml
+++ b/.github/workflows/render.yml
@@ -26,6 +26,8 @@ jobs:
       - uses: r-lib/actions/setup-r-dependencies@v2
       - name: Install extra LaTeX packages
         run: sudo apt install texlive-latex-recommended
+      - name: Install required R packages
+        run: Rscript -e 'install.packages(c( "rmarkdown", "httr", "jsonlite", "xml2", "dplyr", "ggplot2", "lubridate", "tidyr", "stringr" ))'
       - run: mkdir report/
       - name: Render documents
         env:
@@ -34,7 +36,7 @@ jobs:
           period_begin: ${{ github.event.inputs.period_begin }}
           period_end: ${{ github.event.inputs.period_end }}
         run: Rscript -e 'rmarkdown::render("dataverse_metrics.Rmd", "all", output_dir="report")'
-      - uses: actions/upload-artifact@v3
+      - uses: actions/upload-artifact@v4
         with:
           name: dataverse-metrics
           path: report/
diff --git a/dataverse_metrics.Rmd b/dataverse_metrics.Rmd
index efbc226..21a5454 100644
--- a/dataverse_metrics.Rmd
+++ b/dataverse_metrics.Rmd
@@ -55,7 +55,7 @@ if (file.exists("deposits.rds")) {
 } else { # actually scrape
   deposits <- deposits_get(dataverse_host)
   saveRDS(deposits, "deposits.rds")
-  
+
   projects <- content(GET(paste0(dataverse, "/uniquedownloads/monthly")))
   projects <- bind_rows(projects$data)
   saveRDS(projects, "projects.rds")
@@ -71,6 +71,11 @@ if (file.exists("deposits.rds")) {
     saveRDS(downloads, "downloads.rds")
   }
 }
+# Always Fix inconsistent subject types
+deposits$subjects <- lapply(deposits$subjects, function(x) {
+    if (is.null(x) || is.logical(x)) return(NA_character_)
+    as.character(x)
+  })
 ```
 
 
@@ -101,55 +106,92 @@ published_new <- deposits %>% filter(as.Date(published_at) < period_end & as.Dat
 deposits %>% group_by(versionState) %>% summarize(n=n()) %>%
   knitr::kable(caption="Total deposits by publication status", col.names=c("Status", "Deposits"))
 
-# all published projects by subject
-deposits_subj <- deposits %>%
-  filter(versionState=="Published") %>% select(c("subjects")) %>%
-  unnest_longer(subjects) %>% group_by(subjects) %>% summarize(published=n())
-
-# published projects by subject within time frame
-deposits_subj <- deposits %>%
-  filter(as.Date(published_at) < period_end & as.Date(published_at) > period_begin &
-           versionState=="Published") %>%
-  select(c("subjects")) %>% unnest_longer(subjects) %>% group_by(subjects) %>%
-  summarize(period=n()) %>% full_join(deposits_subj)
-
-# Any unpublished projects?
-if (any(deposits$versionState=="Unpublished")) {
-  deposits_subj <- deposits %>% filter(versionState=="Unpublished") %>% select(c("subjects")) %>%
-    unnest_longer(subjects) %>% group_by(subjects) %>%
-    summarize(unpublished=n()) %>% full_join(deposits_subj)
+
+# helper to safely unnest and coerce
+as_char_subjects <- function(df) {
+  df %>%
+    unnest_longer(subjects, keep_empty = TRUE) %>%
+    mutate(subjects = as.character(subjects)) %>%
+    tidyr::replace_na(list(subjects = "Unknown"))
+}
+
+# 1) Total published by subject (all time)
+published_all <- deposits %>%
+  filter(versionState == "Published") %>%
+  select(subjects) %>%
+  as_char_subjects() %>%
+  group_by(subjects) %>%
+  summarize(published = n(), .groups = "drop")
+
+# 2) Published in period by subject
+published_period <- deposits %>%
+  filter(versionState == "Published",
+         as.Date(published_at) < period_end,
+         as.Date(published_at) > period_begin) %>%
+  select(subjects) %>%
+  as_char_subjects() %>%
+  group_by(subjects) %>%
+  summarize(period = n(), .groups = "drop")
+
+# 3) Unpublished totals by subject (if any)
+if (any(deposits$versionState == "Unpublished")) {
+  unpublished_all <- deposits %>%
+    filter(versionState == "Unpublished") %>%
+    select(subjects) %>%
+    as_char_subjects() %>%
+    group_by(subjects) %>%
+    summarize(unpublished = n(), .groups = "drop")
 } else {
-  deposits_subj$unpublished <- 0
+  unpublished_all <- tibble(subjects = character(), unpublished = integer())
 }
 
-# Any unpublished in the time period specified?
-if (any(subset(deposits, as.Date(createdAt) < period_end &
-               as.Date(createdAt) > period_begin)$versionState=="Unpublished")) {
-  deposits_subj <- deposits %>% filter(versionState == "Unpublished" &
-                                         as.Date(createdAt) < period_end &
-                                         as.Date(createdAt) > period_begin) %>%
-    select(c("subjects")) %>% unnest_longer(subjects) %>% group_by(subjects) %>%
-    summarize(unpublished_period=n()) %>% full_join(deposits_subj)
+# 4) Unpublished in period by subject (if any)
+if (any(subset(deposits,
+               as.Date(createdAt) < period_end & as.Date(createdAt) > period_begin)$versionState == "Unpublished")) {
+  unpublished_period <- deposits %>%
+    filter(versionState == "Unpublished",
+           as.Date(createdAt) < period_end,
+           as.Date(createdAt) > period_begin) %>%
+    select(subjects) %>%
+    as_char_subjects() %>%
+    group_by(subjects) %>%
+    summarize(unpublished_period = n(), .groups = "drop")
 } else {
-  deposits_subj$unpublished_period <- 0
+  unpublished_period <- tibble(subjects = character(), unpublished_period = integer())
 }
 
-# exclude unpublished columns if missing dataverse key
-if (dataverse_key=="") {
+# Combine (all joins now match on a character 'subjects')
+deposits_subj <- published_all %>%
+  full_join(published_period, by = "subjects") %>%
+  full_join(unpublished_all, by = "subjects") %>%
+  full_join(unpublished_period, by = "subjects") %>%
+  arrange(subjects)
+
+# Columns depending on token presence
+if (identical(dataverse_key, "") || is.na(dataverse_key)) {
   include_columns <- c("subjects", "period", "published")
   column_names <- c("Subject",
-                     paste("Published between", period_begin, "and", period_end),
-                     "Total published")
+                    paste("Published between", period_begin, "and", period_end),
+                    "Total published")
 } else {
-  include_columns <- c("subjects", "unpublished_period",
-                       "unpublished", "period", "published")
-  column_names <- c("Subject", paste("Unpublished between", period_begin, "and", period_end), "Total draft", paste("Published between", period_begin, "and", period_end), "Total published")
+  include_columns <- c("subjects", "unpublished_period", "unpublished", "period", "published")
+  column_names <- c("Subject",
+                    paste("Unpublished between", period_begin, "and", period_end),
+                    "Total draft",
+                    paste("Published between", period_begin, "and", period_end),
+                    "Total published")
 }
 
-# make the table
 deposits_subj[, include_columns] %>%
-  knitr::kable(caption="Deposits by subject",
-               col.names=column_names)
+  tidyr::replace_na(list(
+    unpublished_period = 0L,
+    unpublished = 0L,
+    period = 0L,
+    published = 0L
+  )) %>%
+  knitr::kable(caption = "Deposits by subject",
+               col.names = column_names)
+
 ```
 
 