install.packages("healthyverse")
library(healthyverse)
A Review of 2024
2024 A Year in Review
The year 2024 was a big year for me. I did a lot of coding, a lot more than I typically do. The biggest push came personally in my ongoing development of my R packages that are in the healthyverse. To use the healthyverse simply do so in the familiar fashion:
Here are links to all of the packages:
In order to start looking at some of the data that pertains to 2024, let's first get the data from the CRAN logs. Since I do this daily already, I can simply use the rds file I already have. I am going to go through the motions though, in case others might want to do something similar. The functions I am using to get the data can be found here
Now lets get that data!
library(tidyverse)
library(lubridate)
source("01_scripts/get_data_functions.R")
source("01_scripts/data_manipulation_functions.R")
source("01_scripts/mapping_functions.R")
# Run the project's data-refresh helpers, in order.
# NOTE(review): presumably these are defined in the sourced 01_scripts files,
# fetch the CRAN download logs and the package release data, and then convert
# the saved CSV files to RDS -- confirm against 01_scripts/get_data_functions.R.
get_cran_data()
get_package_release_data()
csv_to_rds()
Ok, now that we have our data, let's ensure that we are only using the year 2024. We can do this by filtering the data by time with the timetk package.
Now lets filter our data below, some pre-processing may need to take place.
library(timetk)
# Keep only the 2024 rows of the processed download log.
# filter_by_time() is given the year string "2024" for both bounds.
data_tbl <- downloads_processed_tbl() %>%
  filter_by_time(
    .date_var = date,
    .start_date = "2024",
    .end_date = "2024"
  )

# Quick structural check of the filtered data.
glimpse(data_tbl)
Rows: 39,040
Columns: 11
$ date <date> 2024-01-01, 2024-01-01, 2024-01-01, 2024-01-01, 2024-01-01,…
$ time <Period> 15H 9M 37S, 1M 5S, 1M 5S, 1M 19S, 1M 19S, 7H 40M 59S, 5H …
$ date_time <dttm> 2024-01-01 15:09:37, 2024-01-01 00:01:05, 2024-01-01 00:01:…
$ size <int> 2116406, 2116405, 532, 529, 83289, 253032, 2367712, 54319, 2…
$ r_version <chr> "4.1.2", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "4.3.1", "4…
$ r_arch <chr> "x86_64", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "aarch64",…
$ r_os <chr> "mingw32", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "darwin20…
$ package <chr> "healthyR.ts", "healthyR.ts", "healthyR.ts", "tidyAML", "tid…
$ version <chr> "0.3.0", "0.3.0", "0.3.0", "0.0.3", "0.0.3", "0.0.12", "0.2.…
$ country <chr> NA, "JP", "JP", "JP", "JP", "US", "KR", "KR", "KR", "KR", "U…
$ ip_id <int> 2, 6074, 6074, 6074, 6074, 3446, 11, 11, 11, 11, 9833, 2, 22…
Now that we have our data, we have it for the year 2024 only with a start date of 2024-01-01 and an end date of 2024-12-29.
Package Information
The first thing we will do is look at how many downloads there were for each package and its version.
library(knitr)
# Downloads per package and version, reshaped so each package is a column
# and each version is a row; missing combinations are filled with 0.
data_tbl %>%
  count(package, version) %>%
  pivot_wider(
    id_cols = version,
    names_from = package,
    values_from = n,
    values_fill = 0
  ) %>%
  arrange(version) %>%
  kable()
version | RandomWalker | TidyDensity | healthyR | healthyR.ai | healthyR.data | healthyR.ts | healthyverse | tidyAML |
---|---|---|---|---|---|---|---|---|
0.0.1 | 0 | 122 | 0 | 127 | 0 | 0 | 0 | 117 |
0.0.10 | 0 | 0 | 0 | 126 | 0 | 0 | 0 | 0 |
0.0.11 | 0 | 0 | 0 | 129 | 0 | 0 | 0 | 0 |
0.0.12 | 0 | 0 | 0 | 130 | 0 | 0 | 0 | 0 |
0.0.13 | 0 | 0 | 0 | 2050 | 0 | 0 | 0 | 0 |
0.0.13.tar.gz%20H | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 0 |
0.0.2 | 0 | 0 | 0 | 126 | 0 | 0 | 0 | 183 |
0.0.3 | 0 | 0 | 0 | 127 | 0 | 0 | 0 | 213 |
0.0.4 | 0 | 0 | 0 | 118 | 0 | 0 | 0 | 941 |
0.0.5 | 0 | 0 | 0 | 128 | 0 | 0 | 0 | 2236 |
0.0.6 | 0 | 0 | 0 | 845 | 0 | 0 | 0 | 0 |
0.0.7 | 0 | 0 | 0 | 128 | 0 | 0 | 0 | 0 |
0.0.8 | 0 | 0 | 0 | 127 | 0 | 0 | 0 | 0 |
0.0.9 | 0 | 0 | 0 | 138 | 0 | 0 | 0 | 0 |
0.1.0 | 389 | 0 | 126 | 828 | 0 | 130 | 0 | 0 |
0.1.1 | 0 | 0 | 127 | 0 | 0 | 145 | 0 | 0 |
0.1.2 | 0 | 0 | 144 | 0 | 0 | 130 | 0 | 0 |
0.1.3 | 0 | 0 | 127 | 0 | 0 | 129 | 0 | 0 |
0.1.4 | 0 | 0 | 127 | 0 | 0 | 133 | 0 | 0 |
0.1.5 | 0 | 0 | 123 | 0 | 0 | 129 | 0 | 0 |
0.1.6 | 0 | 0 | 128 | 0 | 0 | 123 | 0 | 0 |
0.1.7 | 0 | 0 | 118 | 0 | 0 | 131 | 0 | 0 |
0.1.8 | 0 | 0 | 586 | 0 | 0 | 589 | 0 | 0 |
0.1.9 | 0 | 0 | 127 | 0 | 0 | 134 | 0 | 0 |
0.2.0 | 1106 | 0 | 138 | 0 | 0 | 137 | 0 | 0 |
0.2.1 | 0 | 0 | 2086 | 0 | 0 | 134 | 0 | 0 |
0.2.1.tar.gz%20HT | 0 | 0 | 5 | 0 | 0 | 0 | 0 | 0 |
0.2.10 | 0 | 0 | 0 | 0 | 0 | 126 | 0 | 0 |
0.2.11 | 0 | 0 | 0 | 0 | 0 | 135 | 0 | 0 |
0.2.2 | 0 | 0 | 1564 | 0 | 0 | 124 | 0 | 0 |
0.2.2.tar.gz%20 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 0 |
0.2.3 | 0 | 0 | 0 | 0 | 0 | 131 | 0 | 0 |
0.2.4 | 0 | 0 | 0 | 0 | 0 | 134 | 0 | 0 |
0.2.5 | 0 | 0 | 0 | 0 | 0 | 134 | 0 | 0 |
0.2.6 | 0 | 0 | 0 | 0 | 0 | 134 | 0 | 0 |
0.2.7 | 0 | 0 | 0 | 0 | 0 | 132 | 0 | 0 |
0.2.8 | 0 | 0 | 0 | 0 | 0 | 957 | 0 | 0 |
0.2.9 | 0 | 0 | 0 | 0 | 0 | 144 | 0 | 0 |
0.3.0 | 0 | 0 | 0 | 0 | 0 | 2465 | 0 | 0 |
0.3.0.tar.gz%20H | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 |
0.3.1 | 0 | 0 | 0 | 0 | 0 | 683 | 0 | 0 |
1.0.0 | 0 | 122 | 0 | 0 | 133 | 0 | 142 | 0 |
1.0.1 | 0 | 575 | 0 | 0 | 666 | 0 | 126 | 0 |
1.0.2 | 0 | 0 | 0 | 0 | 534 | 0 | 193 | 0 |
1.0.3 | 0 | 0 | 0 | 0 | 1541 | 0 | 128 | 0 |
1.0.4 | 0 | 0 | 0 | 0 | 0 | 0 | 1901 | 0 |
1.1.0 | 0 | 122 | 0 | 0 | 571 | 0 | 680 | 0 |
1.1.1 | 0 | 0 | 0 | 0 | 1107 | 0 | 0 | 0 |
1.2.0 | 0 | 124 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.1 | 0 | 124 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.2 | 0 | 113 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.3 | 0 | 122 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.4 | 0 | 562 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.5 | 0 | 123 | 0 | 0 | 0 | 0 | 0 | 0 |
1.2.6 | 0 | 271 | 0 | 0 | 0 | 0 | 0 | 0 |
1.3.0 | 0 | 1784 | 0 | 0 | 0 | 0 | 0 | 0 |
1.4.0 | 0 | 1225 | 0 | 0 | 0 | 0 | 0 | 0 |
1.4.0.tar.gz%20H | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1.5.0 | 0 | 2724 | 0 | 0 | 0 | 0 | 0 | 0 |
1.5.0.tar.gz%20HT | 0 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
Now let's see how many total downloads there were for each package for the year.
# Total 2024 downloads per package, rendered as a table.
data_tbl %>%
  count(package) %>%
  set_names("Package", "Total Downloads") %>%
  kable()
Package | Total Downloads |
---|---|
RandomWalker | 1495 |
TidyDensity | 8117 |
healthyR | 5526 |
healthyR.ai | 5132 |
healthyR.data | 4552 |
healthyR.ts | 7358 |
healthyverse | 3170 |
tidyAML | 3690 |
# Number of distinct versions seen per package in the 2024 download log.
# NOTE(review): this counts every distinct version string in the log, so
# malformed entries such as "0.0.13.tar.gz%20H" are included in the count --
# confirm that is intended.
data_tbl %>%
  select(package, version) %>%
  group_by(package) %>%
  distinct() %>%
  mutate(release_count = n()) %>%
  ungroup() %>%
  select(package, release_count) %>%
  distinct() %>%
  set_names("Package", "Number of Releases") %>%
  kable()
Package | Number of Releases |
---|---|
healthyR.ts | 26 |
tidyAML | 5 |
healthyR.ai | 15 |
healthyR | 14 |
healthyverse | 6 |
TidyDensity | 16 |
healthyR.data | 6 |
RandomWalker | 2 |
# Sum the per-package distinct-version counts into a single total.
# The result is a one-row tibble with the column `total`.
total_number_of_releases <- data_tbl %>%
  select(package, version) %>%
  group_by(package) %>%
  distinct() %>%
  mutate(release_count = n()) %>%
  ungroup() %>%
  select(package, release_count) %>%
  distinct() %>%
  summarise(total = sum(release_count, na.rm = TRUE))
So all in all there was a total of 39,040 downloads of all the healthyverse
packages in 2024. There were in total 90 package releases as well.
Graphs
Now lets graph the data out!
# Bar chart of downloads per version, one facet per package.
# alpha is a constant, so it is set on the geom rather than inside aes();
# inside aes() it is treated as a mapped variable and adds a spurious legend.
data_tbl %>%
  count(package, version) %>%
  ggplot(aes(x = version, y = n)) +
  geom_col(aes(group = package, fill = package), alpha = 0.382) +
  facet_wrap(package ~ ., ncol = 2, scales = "free") +
  scale_y_continuous(labels = scales::label_number(big.mark = ",")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Downloads by Package for 2024",
    subtitle = "Faceted by Package",
    x = "Version",
    y = "Downloads",
    fill = "Package"
  )
# Cumulative downloads per package, accumulated in the row order returned by
# count(package, version); `record` is the row index within each package.
# alpha is a constant, so it is set on the geom rather than inside aes().
data_tbl %>%
  count(package, version) %>%
  group_by(package) %>%
  mutate(cumulative_downloads = cumsum(n)) %>%
  mutate(record = row_number()) %>%
  ungroup() %>%
  ggplot(aes(x = record, y = cumulative_downloads)) +
  geom_col(aes(group = package, fill = package), alpha = 0.382) +
  facet_wrap(package ~ ., ncol = 2, scales = "free") +
  scale_y_continuous(labels = scales::label_number(big.mark = ",")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Downloads by Package for 2024",
    subtitle = "Faceted by Package",
    x = "Release Number",
    y = "Downloads",
    fill = "Package"
  )
# Cumulative downloads per package drawn as one line per package on a single
# panel; `record` is the row index within each package.
# alpha is a constant, so it is set on the geom rather than inside aes().
data_tbl %>%
  count(package, version) %>%
  group_by(package) %>%
  mutate(cumulative_downloads = cumsum(n)) %>%
  mutate(record = row_number()) %>%
  ungroup() %>%
  ggplot(aes(x = record, y = cumulative_downloads)) +
  geom_line(aes(color = package, group = package), alpha = 0.382, size = 1) +
  scale_y_continuous(labels = scales::label_number(big.mark = ",")) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(
    title = "Cumulative Downloads by Package for 2024",
    subtitle = "Colored by Package",
    x = "Release Number",
    y = "Downloads",
    color = "Package"
  )
Time Series Graphs
Now lets get some time-series graphs.
library(healthyR.ts)
# Load the saved package release table; it supplies the release dates drawn
# as vertical lines in the plots below.
pkg_tbl <- readRDS("00_data/pkg_release_tbl.rds")
# Calendar heatmap of download counts (static, non-interactive).
data_tbl %>%
  summarise_by_time(.date_var = date, n = n()) %>%
  ts_calendar_heatmap_plot(.date_col = date, .value_col = n, .interactive = FALSE)
# Daily downloads per package on a log1p scale, with a loess trend line and a
# red vertical line at each date in pkg_tbl.
data_tbl %>%
  ts_downloads_tbl(.by_time = "day", package) %>%
  ggplot(aes(date, log1p(value))) +
  geom_point(aes(group = package, color = package), size = 1) +
  ggtitle(paste("Package Downloads: {healthyverse}")) +
  geom_smooth(method = "loess", color = "black", se = FALSE) +
  geom_vline(
    data = pkg_tbl,
    aes(xintercept = as.Date(date)),
    color = "red",
    lwd = 1,
    lty = "solid"
  ) +
  facet_wrap(package ~ ., ncol = 2, scales = "free_x") +
  theme_minimal() +
  labs(
    subtitle = "Vertical lines represent release dates",
    x = "Date",
    y = "log1p(Counts)",
    color = "Package"
  ) +
  theme(legend.position = "bottom")
# Total daily downloads with their first and second differences (renamed
# velocity and acceleration), one facet per series, on a log1p scale.
# Vertical lines mark the dates in pkg_tbl, colored by package.
data_tbl %>%
  ts_downloads_tbl(.by_time = "day") %>%
  rename(Actual = value) %>%
  tk_augment_differences(.value = Actual, .differences = 1) %>%
  tk_augment_differences(.value = Actual, .differences = 2) %>%
  rename(velocity = contains("_diff1")) %>%
  rename(acceleration = contains("_diff2")) %>%
  pivot_longer(-date) %>%
  mutate(name = str_to_title(name)) %>%
  mutate(name = as_factor(name)) %>%
  ggplot(aes(x = date, y = log1p(value), group = name)) +
  geom_point(alpha = .2) +
  geom_vline(
    data = pkg_tbl,
    aes(xintercept = as.Date(date), color = package),
    lwd = 1,
    lty = "solid"
  ) +
  # Argument spelled out in full (`scales`) instead of relying on partial
  # matching of `scale`.
  facet_wrap(name ~ ., ncol = 1, scales = "free") +
  theme_minimal() +
  labs(
    title = "Total Downloads: Trend, Velocity, and Acceleration",
    subtitle = "Vertical Lines Indicate a CRAN Release date for a package.",
    x = "Date",
    y = "",
    color = ""
  ) +
  theme(legend.position = "bottom")
# Seasonal diagnostics of total daily downloads on a log1p scale (static plot).
data_tbl %>%
  ts_downloads_tbl(.by_time = "day") %>%
  plot_seasonal_diagnostics(
    .date_var = date,
    .value = log1p(value),
    .interactive = FALSE
  ) +
  theme_minimal() +
  labs(
    title = "Seasonal Diagnostics",
    subtitle = "Values are log1p"
  )
# STL diagnostics of total daily downloads on a log1p scale (static plot).
data_tbl %>%
  ts_downloads_tbl(.by_time = "day") %>%
  plot_stl_diagnostics(
    .date_var = date,
    .value = log1p(value),
    .interactive = FALSE
  ) +
  theme_minimal() +
  labs(
    title = "STL Diagnostics",
    subtitle = "Values are log1p"
  )
Mapping
So now that we have seen all the downloads in various ways, where did they all come from? Let's take a look.
library(tmaptools)
library(countrycode)
library(mapview)
library(htmlwidgets)
library(webshot)
# mapping_dataset(.data_year = "2024") %>%
# head() %>%
# knitr::kable()
# Build the downloads map, save it as an HTML widget, then try to capture a
# PNG screenshot of it. The screenshot is wrapped in try() so a webshot
# failure does not stop the script.
l <- map_leaflet()
saveWidget(l, "downloads_map.html")
try(webshot("downloads_map.html", file = "map.png", cliprect = "viewport"))
There was a total of 147 different countries that downloaded healthyverse
packages in 2024.