Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/render.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
- uses: r-lib/actions/setup-r-dependencies@v2
- name: Install extra LaTeX packages
run: sudo apt install texlive-latex-recommended
- name: Install required R packages
run: Rscript -e 'install.packages(c( "rmarkdown", "httr", "jsonlite", "xml2", "dplyr", "ggplot2", "lubridate", "tidyr", "stringr" ))'
- run: mkdir report/
- name: Render documents
env:
Expand All @@ -34,7 +36,7 @@ jobs:
period_begin: ${{ github.event.inputs.period_begin }}
period_end: ${{ github.event.inputs.period_end }}
run: Rscript -e 'rmarkdown::render("dataverse_metrics.Rmd", "all", output_dir="report")'
- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: dataverse-metrics
path: report/
118 changes: 80 additions & 38 deletions dataverse_metrics.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ if (file.exists("deposits.rds")) {
} else { # actually scrape
deposits <- deposits_get(dataverse_host)
saveRDS(deposits, "deposits.rds")

projects <- content(GET(paste0(dataverse, "/uniquedownloads/monthly")))
projects <- bind_rows(projects$data)
saveRDS(projects, "projects.rds")
Expand All @@ -71,6 +71,11 @@ if (file.exists("deposits.rds")) {
saveRDS(downloads, "downloads.rds")
}
}
# Normalise the `subjects` list-column on every run, whichever branch of the
# load-or-scrape conditional produced `deposits`.
# NULL and logical entries (presumably all-NA placeholders -- TODO confirm)
# collapse to a single NA_character_; every other entry is coerced to
# character, so all elements of the column end up with the same type.
deposits$subjects <- lapply(deposits$subjects, function(entry) {
  if (is.null(entry) || is.logical(entry)) {
    NA_character_
  } else {
    as.character(entry)
  }
})


```
Expand Down Expand Up @@ -101,55 +106,92 @@ published_new <- deposits %>% filter(as.Date(published_at) < period_end & as.Dat
# Tally deposits per publication status and render the counts as a table.
deposits %>%
  dplyr::count(versionState) %>%
  knitr::kable(
    caption = "Total deposits by publication status",
    col.names = c("Status", "Deposits")
  )

# all published projects by subject
deposits_subj <- deposits %>%
filter(versionState=="Published") %>% select(c("subjects")) %>%
unnest_longer(subjects) %>% group_by(subjects) %>% summarize(published=n())

# published projects by subject within time frame
deposits_subj <- deposits %>%
filter(as.Date(published_at) < period_end & as.Date(published_at) > period_begin &
versionState=="Published") %>%
select(c("subjects")) %>% unnest_longer(subjects) %>% group_by(subjects) %>%
summarize(period=n()) %>% full_join(deposits_subj)

# Any unpublished projects?
if (any(deposits$versionState=="Unpublished")) {
deposits_subj <- deposits %>% filter(versionState=="Unpublished") %>% select(c("subjects")) %>%
unnest_longer(subjects) %>% group_by(subjects) %>%
summarize(unpublished=n()) %>% full_join(deposits_subj)

# Flatten the `subjects` list-column of `df` into one row per subject.
# keep_empty = TRUE keeps rows whose subject entry is empty (they come out as
# missing values); subjects are then coerced to character and any missing
# subject is labelled "Unknown".
as_char_subjects <- function(df) {
  one_per_row <- unnest_longer(df, subjects, keep_empty = TRUE)
  one_per_row <- mutate(one_per_row, subjects = as.character(subjects))
  tidyr::replace_na(one_per_row, list(subjects = "Unknown"))
}

# 1) Published deposits per subject, across all time.
# Result: one row per subject with an integer `published` count, ungrouped.
published_all <- deposits %>%
  filter(versionState == "Published") %>%
  select(subjects) %>%
  as_char_subjects() %>%
  dplyr::count(subjects, name = "published")

# 2) Deposits published inside the reporting period, per subject.
# NOTE(review): both bounds are exclusive, so a deposit published exactly on
# period_begin or period_end is not counted -- confirm this is intended.
published_period <- deposits %>%
  filter(
    versionState == "Published" &
      as.Date(published_at) > period_begin &
      as.Date(published_at) < period_end
  ) %>%
  select(subjects) %>%
  as_char_subjects() %>%
  dplyr::count(subjects, name = "period")

# 3) Unpublished totals by subject (if any)
if (any(deposits$versionState == "Unpublished")) {
unpublished_all <- deposits %>%
filter(versionState == "Unpublished") %>%
select(subjects) %>%
as_char_subjects() %>%
group_by(subjects) %>%
summarize(unpublished = n(), .groups = "drop")
} else {
deposits_subj$unpublished <- 0
unpublished_all <- tibble(subjects = character(), unpublished = integer())
}

# Any unpublished in the time period specified?
if (any(subset(deposits, as.Date(createdAt) < period_end &
as.Date(createdAt) > period_begin)$versionState=="Unpublished")) {
deposits_subj <- deposits %>% filter(versionState == "Unpublished" &
as.Date(createdAt) < period_end &
as.Date(createdAt) > period_begin) %>%
select(c("subjects")) %>% unnest_longer(subjects) %>% group_by(subjects) %>%
summarize(unpublished_period=n()) %>% full_join(deposits_subj)
# 4) Unpublished in period by subject (if any)
if (any(subset(deposits,
as.Date(createdAt) < period_end & as.Date(createdAt) > period_begin)$versionState == "Unpublished")) {
unpublished_period <- deposits %>%
filter(versionState == "Unpublished",
as.Date(createdAt) < period_end,
as.Date(createdAt) > period_begin) %>%
select(subjects) %>%
as_char_subjects() %>%
group_by(subjects) %>%
summarize(unpublished_period = n(), .groups = "drop")
} else {
deposits_subj$unpublished_period <- 0
unpublished_period <- tibble(subjects = character(), unpublished_period = integer())
}

# exclude unpublished columns if missing dataverse key
if (dataverse_key=="") {
# Merge the four per-subject tallies into one table keyed on the character
# `subjects` column, then order the rows by subject. A subject absent from a
# given tally gets NA in that tally's count column.
deposits_subj <- Reduce(
  function(acc, tally) full_join(acc, tally, by = "subjects"),
  list(published_all, published_period, unpublished_all, unpublished_period)
) %>%
  arrange(subjects)

# Columns depending on token presence
if (identical(dataverse_key, "") || is.na(dataverse_key)) {
include_columns <- c("subjects", "period", "published")
column_names <- c("Subject",
paste("Published between", period_begin, "and", period_end),
"Total published")
paste("Published between", period_begin, "and", period_end),
"Total published")
} else {
include_columns <- c("subjects", "unpublished_period",
"unpublished", "period", "published")
column_names <- c("Subject", paste("Unpublished between", period_begin, "and", period_end), "Total draft", paste("Published between", period_begin, "and", period_end), "Total published")
include_columns <- c("subjects", "unpublished_period", "unpublished", "period", "published")
column_names <- c("Subject",
paste("Unpublished between", period_begin, "and", period_end),
"Total draft",
paste("Published between", period_begin, "and", period_end),
"Total published")
}

# make the table
deposits_subj[, include_columns] %>%
knitr::kable(caption="Deposits by subject",
col.names=column_names)
tidyr::replace_na(list(
unpublished_period = 0L,
unpublished = 0L,
period = 0L,
published = 0L
)) %>%
knitr::kable(caption = "Deposits by subject",
col.names = column_names)


```

Expand Down