library(data.table)
library(tidyr)
library(stats)
library(purrr)
<- function(num_sims, n, pr) {
new_func
# Create a data.table with one row per simulation
<- data.table(sim_number = factor(seq(1, num_sims, 1)))
sim_data
# Group the data by sim_number and add columns for x and y
`:=` (
sim_data[, x = list(1:n),
y = list(stats::rbinom(n = n, size = 1, prob = pr))
= sim_number]
), by
# Compute the density of the y values and add columns for dx and dy
`:=` (
sim_data[, d = list(density(unlist(y), n = n)[c("x", "y")] |>
set_names("dx", "dy") |>
as_tibble())
= sim_number]
), by
# Compute the p-values for the y values and add a column for p
`:=` (
sim_data[, p = list(stats::pbinom(unlist(y), size = 1, prob = pr))
= sim_number]
), by
# Compute the q-values for the p-values and add a column for q
`:=` (
sim_data[, q = list(stats::qbinom(unlist(p), size = 1, prob = pr))
= sim_number]
), by
# Unnest the columns for x, y, d, p, and q
<- sim_data[,
sim_data unnest(
.SD, cols = c("x", "y", "d", "p", "q")
), = sim_number]
by
# Remove the grouping
:= as.factor(sim_number)]
sim_data[, sim_number
return(sim_data)
}
Introduction
So I was challanged by Adrian Antico to learn data.table
, so yesterday I started with a single function from my package {TidyDensity}
called tidy_bernoulli().
So let’s see how I did (hint, works but needs a lot of improvement, so I’ll learn it.)
Function
Let’s see the function in data.table
Example
Now, let’s see the output of the original function tidy_bernoulli() and new_func().
library(TidyDensity)
<- 50
n <- 0.1
pr <- 5
sims
set.seed(123)
<- tidy_bernoulli(.n = n, .prob = pr, .num_sims = sims)
tb
set.seed(123)
<- new_func(n = n, num_sims = sims, pr = pr)
nf
print(tb)
# A tibble: 250 × 7
sim_number x y dx dy p q
<fct> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 -0.405 0.0292 0.9 0
2 1 2 0 -0.368 0.0637 0.9 0
3 1 3 0 -0.331 0.129 0.9 0
4 1 4 0 -0.294 0.243 0.9 0
5 1 5 1 -0.258 0.424 1 1
6 1 6 0 -0.221 0.688 0.9 0
7 1 7 0 -0.184 1.03 0.9 0
8 1 8 0 -0.147 1.44 0.9 0
9 1 9 0 -0.110 1.87 0.9 0
10 1 10 0 -0.0727 2.25 0.9 0
# ℹ 240 more rows
print(nf)
sim_number x y dx dy p q
1: 1 1 0 -0.4053113 0.029196114 0.9 0
2: 1 2 0 -0.3683598 0.063683226 0.9 0
3: 1 3 0 -0.3314083 0.129227066 0.9 0
4: 1 4 0 -0.2944568 0.242967496 0.9 0
5: 1 5 1 -0.2575054 0.424395426 1.0 1
---
246: 5 46 0 1.2575054 0.057872104 0.9 0
247: 5 47 0 1.2944568 0.033131931 0.9 0
248: 5 48 1 1.3314083 0.017621873 1.0 1
249: 5 49 1 1.3683598 0.008684076 1.0 1
250: 5 50 0 1.4053113 0.003981288 0.9 0
Ok so at least the output is identical which is a good sign. Now let’s benchmark the two solutions.
library(rbenchmark)
library(dplyr)
benchmark(
"original" = {
tidy_bernoulli(.n = n, .prob = pr, .num_sims = sims)
},"data.table" = {
new_func(n = n, pr = pr, num_sims = sims)
},replications = 100,
columns = c("test","replications","elapsed","relative","user.self","sys.self" )
|>
) arrange(relative)
test replications elapsed relative user.self sys.self
1 original 100 3.18 1.000 2.89 0.12
2 data.table 100 4.89 1.538 4.49 0.11
Yeah, needs some work but it’s a start.