#####################################################
### Replication Script for the JSS Article:
### collapse: Advanced and Fast Statistical Computing
###           and Data Transformation in R
### By: Sebastian Krantz, IfW Kiel
### E-Mail: sebastian.krantz@ifw-kiel.de
#####################################################

###################################################
### code chunk number 0: Preliminaries
###################################################

options(prompt = "R> ", continue = "+  ", width = 77, digits = 4, useFancyQuotes = FALSE, warn = 1)

# Loading libraries and installing if unavailable
if(!requireNamespace("fastverse", quietly = TRUE)) install.packages("fastverse")
options(fastverse.styling = FALSE)
library(fastverse) # loads data.table, collapse, magrittr and kit (not used)
## -- Attaching packages ------------------------------------ fastverse 0.3.2 --
## v data.table 1.15.0     v kit        0.0.13
## v magrittr   2.0.3      v collapse   2.0.10
fastverse_extend(microbenchmark, Rfast, fixest, install = TRUE) # loads and installs if unavailable
## -- Attaching extension packages -------------------------- fastverse 0.3.2 --
## Warning: package 'Rcpp' was built under R version 4.3.1
## v microbenchmark 1.4.10     v fixest         0.11.3
## v Rfast          2.1.0
## -- Conflicts --------------------------------------- fastverse_conflicts() --
## x fixest::fdim()     masks collapse::fdim()
## x Rfast::group()     masks collapse::group()
## x Rfast::transpose() masks data.table::transpose()
# Package versions used in the article:
# fastverse 0.3.2, collapse 2.0.10, data.table 1.15.0, magrittr 2.0.3,
# microbenchmark 1.4.10, Rfast 2.1.0, and fixest 0.11.3

###################################################
### code chunk number 1: collapse Topics and Documentation
###################################################
.COLLAPSE_TOPICS
##  [1] "collapse-documentation"     "fast-statistical-functions"
##  [3] "fast-grouping-ordering"     "fast-data-manipulation"    
##  [5] "quick-conversion"           "advanced-aggregation"      
##  [7] "data-transformations"       "time-series-panel-series"  
##  [9] "list-processing"            "summary-statistics"        
## [11] "recode-replace"             "efficient-programming"     
## [13] "small-helpers"              "collapse-options"
help("collapse-documentation")


###################################################
### code chunk number 2: Fast Statistical Functions: Basic Examples
###################################################
fmean(mtcars$mpg)
## [1] 20.09
fmean(EuStockMarkets)
##  DAX  SMI  CAC FTSE 
## 2531 3376 2228 3566
fmean(mtcars[5:10])
##    drat      wt    qsec      vs      am    gear 
##  3.5966  3.2173 17.8488  0.4375  0.4062  3.6875
fmean(mtcars$mpg, w = mtcars$wt)
## [1] 18.55
fmean(mtcars$mpg, g = mtcars$cyl)
##     4     6     8 
## 26.66 19.74 15.10
fmean(mtcars$mpg, g = mtcars$cyl, w = mtcars$wt)
##     4     6     8 
## 25.94 19.65 14.81
fmean(mtcars[5:10], g = mtcars$cyl, w = mtcars$wt)
##    drat    wt  qsec     vs     am  gear
## 4 4.031 2.415 19.38 0.9149 0.6498 4.047
## 6 3.569 3.152 18.12 0.6212 0.3788 3.821
## 8 3.206 4.133 16.89 0.0000 0.1204 3.241
fmean(mtcars$mpg, g = mtcars$cyl, TRA = "fill") |> head(20)
##  [1] 19.74 19.74 26.66 19.74 15.10 19.74 15.10 26.66 26.66 19.74 19.74 15.10
## [13] 15.10 15.10 15.10 15.10 15.10 26.66 26.66 26.66
###################################################
### code chunk number 3: Airquality Dataset
###################################################
fnobs(airquality)
##   Ozone Solar.R    Wind    Temp   Month     Day 
##     116     146     153     153     153     153
###################################################
### code chunk number 4: Imputation by Reference
###################################################
fmedian(airquality[1:2], airquality$Month, TRA = "replace_na", set = TRUE)


###################################################
### code chunk number 5: Transformation Example
###################################################
airquality |> fmutate(
  rad_day = fsum(as.double(Solar.R), Day, TRA = "/"),
  ozone_deg = Ozone / Temp,
  ozone_amed = Ozone > fmedian(Ozone, Month, TRA = "fill"),
  ozone_resid = fmean(Ozone, list(Month, ozone_amed), ozone_deg, "-")
) |> head(3)
##   Ozone Solar.R Wind Temp Month Day rad_day ozone_deg ozone_amed ozone_resid
## 1    41     190  7.4   67     5   1   0.191    0.6119       TRUE     -10.279
## 2    36     118  8.0   72     5   2   0.135    0.5000       TRUE     -15.279
## 3    12     149 12.6   74     5   3   0.168    0.1622      FALSE      -3.035
###################################################
### code chunk number 6: GRP Objects
###################################################
str(g <- GRP(mtcars, ~ cyl + vs + am))
## Class 'GRP'  hidden list of 9
##  $ N.groups    : int 7
##  $ group.id    : int [1:32] 4 4 3 5 6 5 6 2 2 5 ...
##  $ group.sizes : int [1:7] 1 3 7 3 4 12 2
##  $ groups      :'data.frame':	7 obs. of  3 variables:
##   ..$ cyl: num [1:7] 4 4 4 6 6 8 8
##   ..$ vs : num [1:7] 0 1 1 0 1 0 0
##   ..$ am : num [1:7] 1 0 1 1 0 0 1
##  $ group.vars  : chr [1:3] "cyl" "vs" "am"
##  $ ordered     : Named logi [1:2] TRUE FALSE
##   ..- attr(*, "names")= chr [1:2] "ordered" "sorted"
##  $ order       : int [1:32] 27 8 9 21 3 18 19 20 26 28 ...
##   ..- attr(*, "starts")= int [1:7] 1 2 5 12 15 19 31
##   ..- attr(*, "maxgrpn")= int 12
##   ..- attr(*, "sorted")= logi FALSE
##  $ group.starts: int [1:7] 27 8 3 1 4 5 29
##  $ call        : language GRP.default(X = mtcars, by = ~cyl + vs + am)
###################################################
### code chunk number 7: Aggregation with GRP Objects
###################################################
dat <- get_vars(mtcars, c("mpg", "disp")); w <- mtcars$wt
add_vars(g$groups,
  fmean(dat, g, w, use.g.names = FALSE) |> add_stub("w_mean_"),
  fsd(dat, g, w, use.g.names = FALSE) |> add_stub("w_sd_")) |> head(2)
##   cyl vs am w_mean_mpg w_mean_disp w_sd_mpg w_sd_disp
## 1   4  0  1      26.00       120.3    0.000       0.0
## 2   4  1  0      23.02       137.1    1.236      11.6
###################################################
### code chunk number 8: Transformation with GRP Objects
###################################################
mtcars |> add_vars(fmean(dat, g, w, "-") |> add_stub("w_demean_"),
                   fscale(dat, g, w) |> add_stub("w_scale_")) |> head(2)
##               mpg cyl disp  hp drat    wt  qsec vs am gear carb w_demean_mpg
## Mazda RX4      21   6  160 110  3.9 2.620 16.46  0  1    4    4       0.4357
## Mazda RX4 Wag  21   6  160 110  3.9 2.875 17.02  0  1    4    4       0.4357
##               w_demean_disp w_scale_mpg w_scale_disp
## Mazda RX4             5.027      0.6657       0.6657
## Mazda RX4 Wag         5.027      0.6657       0.6657
###################################################
### code chunk number 9: fsummarise Integration
###################################################
mtcars |>
  fsubset(mpg > 11) |>
  fgroup_by(cyl, vs, am) |>
  fsummarise(across(c(mpg, carb, hp), fmean),
             qsec_w_med = fmean(qsec, wt)) |> head(2)
##   cyl vs am  mpg  carb    hp qsec_w_med
## 1   4  0  1 26.0 2.000 91.00      16.70
## 2   4  1  0 22.9 1.667 84.67      21.04
###################################################
### code chunk number 10: grouped_df Methods for Fast Statistical Functions
###################################################
mtcars |>
  fsubset(mpg > 11, cyl, vs, am, mpg, carb, hp, wt) |>
  fgroup_by(cyl, vs, am) |>
  fmean(wt) |> head(2)
##   cyl vs am sum.wt   mpg carb   hp
## 1   4  0  1  2.140 26.00 2.00 91.0
## 2   4  1  0  8.805 23.02 1.72 83.6
###################################################
### code chunk number 11: Vectorized Grouped Linear Regression
###################################################
mtcars |>
 fgroup_by(vs) |>
 fmutate(dm_carb = fmean(carb, TRA = "-")) |>
 fsummarise(slope = fsum(mpg, dm_carb) %/=% fsum(dm_carb^2))
##   vs   slope
## 1  0 -0.5557
## 2  1 -2.0706
###################################################
### code chunk number 12: Advanced Weighted Group Statistics
###################################################
mtcars |>
    fgroup_by(cyl, vs, am) |>
    fmutate(o = radixorder(GRPid(), mpg)) |>
    fsummarise(mpg_min = fmin(mpg),
               mpg_Q1 = fnth(mpg, 0.25, wt, o = o, ties = "q8"),
               mpg_mean = fmean(mpg, wt),
               mpg_median = fmedian(mpg, wt, o = o, ties = "q8"),
               mpg_mode = fmode(mpg, wt, ties = "max"),
               mpg_Q3 = fnth(mpg, 0.75, wt, o = o, ties = "q8"),
               mpg_max = fmax(mpg)) |> head(3)
##   cyl vs am mpg_min mpg_Q1 mpg_mean mpg_median mpg_mode mpg_Q3 mpg_max
## 1   4  0  1    26.0  26.00    26.00      26.00     26.0  26.00    26.0
## 2   4  1  0    21.5  21.90    23.02      23.16     24.4  24.38    24.4
## 3   4  1  1    21.4  22.37    27.74      28.28     30.4  31.51    33.9
###################################################
### code chunk number 13: Data Aggregation with collap()
###################################################
collap(wlddev, country + PCGDP + LIFEEX ~ year + income, w = ~ POP) |>
  head(4)
##         country year              income   PCGDP LIFEEX       POP
## 1 United States 1960         High income 12768.7  68.59 7.495e+08
## 2      Ethiopia 1960          Low income   658.5  38.33 1.474e+08
## 3         India 1960 Lower middle income   500.8  45.27 9.280e+08
## 4         China 1960 Upper middle income  1166.1  49.86 1.184e+09
###################################################
### code chunk number 14: Growth Rate of Airmiles Time Series
###################################################
fgrowth(airmiles) |> round(2)
## Time Series:
## Start = 1937 
## End = 1960 
## Frequency = 1 
##  [1]    NA 16.50 42.29 54.03 31.65  2.38 15.23 33.29 54.36 76.92  2.71 -2.10
## [13] 12.91 18.51 32.03 18.57 17.82 13.61 18.19 12.83 13.32  0.01 15.49  4.25
###################################################
### code chunk number 15: Creating an Irregular Series and Demonstrating Indexation
###################################################
am_ir <- airmiles[-c(3, 15)]
t <- time(airmiles)[-c(3, 15)]
fgrowth(am_ir, t = t) |> round(2)
##  [1]    NA 16.50    NA 31.65  2.38 15.23 33.29 54.36 76.92  2.71 -2.10 12.91
## [13] 18.51    NA 17.82 13.61 18.19 12.83 13.32  0.01 15.49  4.25
fgrowth(am_ir, -1:3, t = t) |> head(4)
##          FG1   --    G1  L2G1  L3G1
## [1,] -14.167  412    NA    NA    NA
## [2,]      NA  480 16.50    NA    NA
## [3,] -24.043 1052    NA 119.2 155.3
## [4,]  -2.327 1385 31.65    NA 188.5
###################################################
### code chunk number 16: Ad-Hoc Transformations on World Bank Panel Data Supplied with collapse
###################################################
G(wlddev, c(1, 10), by = POP + LIFEEX ~ iso3c, t = ~ year) |> head(3)
##   iso3c year G1.POP L10G1.POP G1.LIFEEX L10G1.LIFEEX
## 1   AFG 1960     NA        NA        NA           NA
## 2   AFG 1961  1.917        NA     1.590           NA
## 3   AFG 1962  1.985        NA     1.544           NA
settransform(wlddev, POP_growth = G(POP, g = iso3c, t = year))


###################################################
### code chunk number 17: Integration with Data Manipualtion Functions
###################################################
wlddev |> fgroup_by(iso3c) |> fselect(iso3c, year, POP, LIFEEX) |>
  fmutate(across(c(POP, LIFEEX), G, t = year)) |> head(2)
##   iso3c year     POP LIFEEX G1.POP G1.LIFEEX
## 1   AFG 1960 8996973  32.45     NA        NA
## 2   AFG 1961 9169410  32.96  1.917      1.59
###################################################
### code chunk number 18: Two Solutions for Grouped Scaling
###################################################
iris |> fgroup_by(Species) |> fscale() |> head(2)
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  setosa       0.2667      0.1899       -0.357     -0.4365
## 2  setosa      -0.3007     -1.1291       -0.357     -0.4365
STD(iris, ~ Species) |> head(2)
##   Species STD.Sepal.Length STD.Sepal.Width STD.Petal.Length STD.Petal.Width
## 1  setosa           0.2667          0.1899           -0.357         -0.4365
## 2  setosa          -0.3007         -1.1291           -0.357         -0.4365
###################################################
### code chunk number 19: Fixed Effects Regression a la Mundlak (1978)
###################################################
lm(mpg ~ carb + B(carb, cyl), data = mtcars) |> coef()
##  (Intercept)         carb B(carb, cyl) 
##      34.8297      -0.4655      -4.7750
###################################################
### code chunk number 20: Detrending with Country-Level Cubic Polynomials: Requires {fixest}
###################################################
HDW(wlddev, PCGDP + LIFEEX ~ iso3c * poly(year, 3), stub = F) |> head(2)
##    PCGDP   LIFEEX
## 1  8.885 0.023614
## 2 13.685 0.006724
###################################################
### code chunk number 21: Indexed Frame
###################################################
wldi <- wlddev |> findex_by(iso3c, year)
wldi |> fsubset(-3, iso3c, year, PCGDP:POP) |> G() |> head(4)
##   iso3c year G1.PCGDP G1.LIFEEX G1.GINI G1.ODA G1.POP
## 1   AFG 1960       NA        NA      NA     NA     NA
## 2   AFG 1961       NA     1.590      NA  98.75  1.917
## 3   AFG 1963       NA        NA      NA     NA     NA
## 4   AFG 1964       NA     1.448      NA  24.48  2.112
## 
## Indexed by:  iso3c [1] | year [4 (61)]
###################################################
### code chunk number 22: Indexed Series
###################################################
LIFEEXi = wldi$LIFEEX
str(LIFEEXi, width = 70, strict = "cut")
##  'indexed_series' num [1:13176] 32.4 33 33.5 34 34.5 ...
##  - attr(*, "label")= chr "Life expectancy at birth, total (years)"
##  - attr(*, "index_df")=Classes 'index_df', 'pindex' and 'data.frame'..
##   ..$ iso3c: Factor w/ 216 levels "ABW","AFG","AGO",..: 2 2 2 2 2 2 ..
##   .. ..- attr(*, "label")= chr "Country Code"
##   ..$ year : Ord.factor w/ 61 levels "1960"<"1961"<..: 1 2 3 4 5 6 7..
##   .. ..- attr(*, "label")= chr "Year"
c(is_irregular(LIFEEXi), is_irregular(LIFEEXi[-5]))
## [1] FALSE  TRUE
G(LIFEEXi[c(1:5, 7:10)])
## [1]    NA 1.590 1.544 1.494 1.448    NA 1.366 1.362 1.365
## 
## Indexed by:  iso3c [1] | year [9 (61)]
###################################################
### code chunk number 23: Demonstrating Deep Indexation
###################################################
settransform(wldi, PCGDP_ld = Dlog(PCGDP))
lm(D(LIFEEX) ~ L(PCGDP_ld, 0:5) + B(PCGDP_ld), wldi) |>
  summary() |> coef() |> round(3)
##                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)           0.299      0.007  44.412    0.000
## L(PCGDP_ld, 0:5)--    0.300      0.080   3.735    0.000
## L(PCGDP_ld, 0:5)L1    0.269      0.081   3.332    0.001
## L(PCGDP_ld, 0:5)L2    0.227      0.079   2.854    0.004
## L(PCGDP_ld, 0:5)L3    0.200      0.078   2.563    0.010
## L(PCGDP_ld, 0:5)L4    0.143      0.076   1.871    0.061
## L(PCGDP_ld, 0:5)L5    0.095      0.073   1.301    0.193
## B(PCGDP_ld)          -1.021      0.316  -3.234    0.001
###################################################
### code chunk number 24: Using 3rd Party Functions: Rolling Average
###################################################
BY(LIFEEXi, findex(LIFEEXi)$iso3c, data.table::frollmean, 5) |> head(10)
##  [1]    NA    NA    NA    NA 33.46 33.96 34.46 34.95 35.43 35.92
## 
## Indexed by:  iso3c [1] | year [10 (61)]
###################################################
### code chunk number 25: Joins: Adding Join Column
###################################################
df1 <- data.frame(id1 = c(1, 1, 2, 3),
                  id2 = c("a", "b", "b", "c"),
                  name = c("John", "Jane", "Bob", "Carl"),
                  age = c(35, 28, 42, 50))
df2 <- data.frame(id1 = c(1, 2, 3, 3),
                  id2 = c("a", "b", "c", "e"),
                  salary = c(60000, 55000, 70000, 80000),
                  dept = c("IT", "Marketing", "Sales", "IT"))

join(df1, df2, on = c("id1", "id2"), how = "full", column = TRUE)
## full join: df1[id1, id2] 3/4 (75%) <m:m> df2[id1, id2] 3/4 (75%)
##   id1 id2 name age salary      dept   .join
## 1   1   a John  35  60000        IT matched
## 2   1   b Jane  28     NA      <NA>     df1
## 3   2   b  Bob  42  55000 Marketing matched
## 4   3   c Carl  50  70000     Sales matched
## 5   3   e <NA>  NA  80000        IT     df2
###################################################
### code chunk number 26: Validation + Join Attribute
###################################################
join(df1, df2, on = c("id1", "id2"), validate = "1:1", attr = "join") |>
  attr("join") |> str(width = 70, strict = "cut")
## left join: df1[id1, id2] 3/4 (75%) <1:1> df2[id1, id2] 3/4 (75%)
## List of 3
##  $ call   : language join(x = df1, y = df2, on = c("id1", "id2"), v"..
##  $ on.cols:List of 2
##   ..$ x: chr [1:2] "id1" "id2"
##   ..$ y: chr [1:2] "id1" "id2"
##  $ match  : 'qG' int [1:4] 1 NA 2 3
##   ..- attr(*, "N.nomatch")= int 1
##   ..- attr(*, "N.groups")= int 4
##   ..- attr(*, "N.distinct")= int 3
###################################################
### code chunk number 27: Overidentification Warning
###################################################
df2$name = df1$name
join(df1, df2) |> capture.output(type="m") |> strwrap(77) |> cat(sep="\n")
## Warning in fmatch(x[ixon], y[iyon], nomatch = NA_integer_, count = count, :
## Overidentified match/join: the first 2 of 3 columns uniquely match the
## records. With overid > 0, fmatch() continues to match columns. Consider
## removing columns or setting overid = 0 to terminate the algorithm after 2
## columns (the results may differ, see ?fmatch). Alternatively set overid = 2
## to silence this warning.
## left join: df1[id1, id2, name] 1/4 (25%) <m:m> df2[id1, id2, name] 1/4 (25%)
##   id1 id2 name age salary dept
## 1   1   a John  35  60000   IT
## 2   1   b Jane  28     NA <NA>
## 3   2   b  Bob  42     NA <NA>
## 4   3   c Carl  50     NA <NA>
###################################################
### code chunk number 28: Automatic Renaming
###################################################
join(df1, df2, on = c("id1", "id2"))
## left join: df1[id1, id2] 3/4 (75%) <m:m> df2[id1, id2] 3/4 (75%)
## duplicate columns: name => renamed using suffix '_df2' for y
##   id1 id2 name age salary      dept name_df2
## 1   1   a John  35  60000        IT     John
## 2   1   b Jane  28     NA      <NA>     <NA>
## 3   2   b  Bob  42  55000 Marketing     Jane
## 4   3   c Carl  50  70000     Sales      Bob
###################################################
### code chunk number 29: Data for Pivots
###################################################
data <- data.frame(type = rep(c("A", "B"), each = 2),
            type_name = rep(c("Apples", "Bananas"), each = 2),
            id = rep(1:2, 2), r = abs(rnorm(4)), h = abs(rnorm(4)*2))
setrelabel(data, id = "Fruit Id", r = "Fruit Radius", h = "Fruit Height")
print(data)
##   type type_name id      r      h
## 1    A    Apples  1 0.1465 3.1713
## 2    A    Apples  2 0.4870 1.7201
## 3    B   Bananas  1 0.7365 0.4584
## 4    B   Bananas  2 2.4150 0.9972
vlabels(data)
##           type      type_name             id              r              h 
##             NA             NA     "Fruit Id" "Fruit Radius" "Fruit Height"
###################################################
### code chunk number 30: Pivot Longer
###################################################
(dl <- pivot(data, ids = c("type", "type_name", "id"), labels = "label"))
##   type type_name id variable        label  value
## 1    A    Apples  1        r Fruit Radius 0.1465
## 2    A    Apples  2        r Fruit Radius 0.4870
## 3    B   Bananas  1        r Fruit Radius 0.7365
## 4    B   Bananas  2        r Fruit Radius 2.4150
## 5    A    Apples  1        h Fruit Height 3.1713
## 6    A    Apples  2        h Fruit Height 1.7201
## 7    B   Bananas  1        h Fruit Height 0.4584
## 8    B   Bananas  2        h Fruit Height 0.9972
vlabels(dl)
##       type  type_name         id   variable      label      value 
##         NA         NA "Fruit Id"         NA         NA         NA
###################################################
### code chunk number 31: Pivot Wider
###################################################
(dw <- pivot(data, "id", names = "type", labels = "type_name", how = "w"))
##   id    r_A    r_B   h_A    h_B
## 1  1 0.1465 0.7365 3.171 0.4584
## 2  2 0.4870 2.4150 1.720 0.9972
namlab(dw)
##   Variable                  Label
## 1       id               Fruit Id
## 2      r_A  Fruit Radius - Apples
## 3      r_B Fruit Radius - Bananas
## 4      h_A  Fruit Height - Apples
## 5      h_B Fruit Height - Bananas
###################################################
### code chunk number 32: Pivot Recast
###################################################
(dr <- pivot(data, ids = "id", names = list(from = "type"),
             labels = list(from = "type_name", to = "label"), how = "r"))
##   id variable        label      A      B
## 1  1        r Fruit Radius 0.1465 0.7365
## 2  2        r Fruit Radius 0.4870 2.4150
## 3  1        h Fruit Height 3.1713 0.4584
## 4  2        h Fruit Height 1.7201 0.9972
vlabels(dr)
##         id   variable      label          A          B 
## "Fruit Id"         NA         NA   "Apples"  "Bananas"
###################################################
### code chunk number 33: Recursive Splitting: Creates Nested List of Data Frames
###################################################
(dl <- mtcars |> rsplit(mpg + hp + carb ~ vs + am)) |> str(max.level = 2)
## List of 2
##  $ 0:List of 2
##   ..$ 0:'data.frame':	12 obs. of  3 variables:
##   ..$ 1:'data.frame':	6 obs. of  3 variables:
##  $ 1:List of 2
##   ..$ 0:'data.frame':	7 obs. of  3 variables:
##   ..$ 1:'data.frame':	7 obs. of  3 variables:
###################################################
### code chunk number 34: Fitting Linear Models and Obtaining Coefficient Matrices
###################################################
nest_lm_coef <- dl |>
  rapply2d(lm, formula = mpg ~ .) |>
  rapply2d(summary, classes = "lm") |>
  get_elem("coefficients")

nest_lm_coef |> str(give.attr = FALSE, strict = "cut")
## List of 2
##  $ 0:List of 2
##   ..$ 0: num [1:3, 1:4] 15.8791 0.0683 -4.5715 3.655 0.0345 ...
##   ..$ 1: num [1:3, 1:4] 26.9556 -0.0319 -0.308 2.293 0.0149 ...
##  $ 1:List of 2
##   ..$ 0: num [1:3, 1:4] 30.896903 -0.099403 -0.000332 3.346033 0.03587 ...
##   ..$ 1: num [1:3, 1:4] 37.0012 -0.1155 0.4762 7.3316 0.0894 ...
###################################################
### code chunk number 35: Unlisting to Data Frame
###################################################
nest_lm_coef |> unlist2d(c("vs", "am"), row.names = "variable") |> head(2)
##   vs am    variable Estimate Std. Error t value Pr(>|t|)
## 1  0  0 (Intercept) 15.87915    3.65495   4.345 0.001865
## 2  0  0          hp  0.06832    0.03449   1.981 0.078938
###################################################
### code chunk number 36: Removing Generated Series (Hidden)
###################################################
wldi <- wldi[1:13]

###################################################
### code chunk number 37: Which Columns/Countries have Time Varying Information?
###################################################
varying(wldi)
## country    date    year  decade  region  income    OECD   PCGDP  LIFEEX 
##   FALSE    TRUE    TRUE    TRUE   FALSE   FALSE   FALSE    TRUE    TRUE 
##    GINI     ODA     POP 
##    TRUE    TRUE    TRUE
varying(wldi, any_group = FALSE) |> head(3)
##     country date year decade region income  OECD PCGDP LIFEEX GINI  ODA  POP
## ABW   FALSE TRUE TRUE   TRUE  FALSE  FALSE FALSE  TRUE   TRUE   NA TRUE TRUE
## AFG   FALSE TRUE TRUE   TRUE  FALSE  FALSE FALSE  TRUE   TRUE   NA TRUE TRUE
## AGO   FALSE TRUE TRUE   TRUE  FALSE  FALSE FALSE  TRUE   TRUE TRUE TRUE TRUE
###################################################
### code chunk number 38: Demonstrating Panel Decomposition
###################################################
all.equal(fvar(W(LIFEEXi)) + fvar(B(LIFEEXi)), fvar(LIFEEXi))
## [1] TRUE
###################################################
### code chunk number 39: Panel Summary Statistics
###################################################
qsu(LIFEEXi)
##              N/T     Mean       SD      Min      Max
## Overall    11670  64.2963  11.4764   18.907  85.4171
## Between      207  64.9537   9.8936  40.9663  85.4171
## Within   56.3768  64.2963   6.0842  32.9068  84.4198
###################################################
### code chunk number 40: Weighted Panel Summary Statistics by Groups
###################################################
qsu(LIFEEXi, g = wlddev$OECD, w = wlddev$POP, higher = TRUE) |> aperm()
## , , FALSE
## 
##              N/T     Mean      SD      Min      Max     Skew    Kurt
## Overall     9503  63.5476  9.2368   18.907  85.4171  -0.7394  2.7961
## Between      171  63.5476  6.0788  43.0905  85.4171  -0.8041   3.082
## Within   55.5731  65.8807  6.9545  30.3388  82.8832  -1.0323  4.0998
## 
## , , TRUE
## 
##              N/T     Mean      SD      Min      Max     Skew    Kurt
## Overall     2156  74.9749  5.3627   45.369  84.3563  -1.2966  6.5505
## Between       36  74.9749  2.9256  66.2983  78.6733  -1.3534  4.5999
## Within   59.8889  65.8807  4.4944  44.9513  77.2733   -0.627  3.9839
###################################################
### code chunk number 41: Detailed (Grouped, Weighted) Statistical Description
###################################################
descr(wlddev, LIFEEX ~ OECD, w = ~ replace_na(POP))
## Dataset: wlddev, 1 Variables, N = 13176, WeightSum = 313233706778
## Grouped by: OECD [2]
##            N   Perc       WeightSum  Perc
## FALSE  10980  83.33  2.49344474e+11  79.6
## TRUE    2196  16.67  6.38892329e+10  20.4
## -----------------------------------------------------------------------------
## LIFEEX (numeric): Life expectancy at birth, total (years)
## Statistics (N = 11659, 11.51% NAs)
##           N   Perc  Ndist   Mean    SD    Min    Max   Skew  Kurt
## FALSE  9503  81.51   8665  63.55  9.24  18.91  85.42  -0.74   2.8
## TRUE   2156  18.49   2016  74.97  5.36  45.37  84.36   -1.3  6.55
## 
## Quantiles
##           1%     5%    10%    25%    50%    75%    90%    95%    99%
## FALSE  41.39  45.78  49.08  57.51  65.98  70.14  74.12  75.63  76.91
## TRUE   56.65  65.98   69.7  71.85  75.38  78.64  81.26  82.43   83.6
## -----------------------------------------------------------------------------
###################################################
### code chunk number 42: qtab: Basic Usage
###################################################
library(magrittr) # World after 2015 (latest country data)
wlda15 <- wlddev |> fsubset(year >= 2015) |> fgroup_by(iso3c) |> flast()
wlda15 %$% qtab(OECD, income)
##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE          45         30                  47                  58
##   TRUE           34          0                   0                   2
###################################################
### code chunk number 43: qtab: Population Counts
###################################################
wlda15 %$% qtab(OECD, income, w = POP) %>% divide_by(1e6)
##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       93.01     694.89             3063.54             2459.71
##   TRUE      1098.75       0.00                0.00              211.01
###################################################
### code chunk number 44: qtab: Average Life Expectancy
###################################################
wlda15 %$% qtab(OECD, income, w = LIFEEX, wFUN = fmean) %>% replace_na(0)
##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       78.75      62.81               68.30               73.81
##   TRUE        81.09       0.00                0.00               76.37
###################################################
### code chunk number 45: qtab: Population Weighted Average Life Expectancy
###################################################
wlda15 %$% qtab(OECD, income, w = LIFEEX, wFUN = fmean,
                wFUN.args = list(w = POP)) %>% replace_na(0)
##        income
## OECD    High income Low income Lower middle income Upper middle income
##   FALSE       77.91      63.81               68.76               75.93
##   TRUE        81.13       0.00                0.00               76.10
###################################################
### code chunk number 46: Benchmark: Statistics and Data Manipulation
###################################################
setDTthreads(4)
set_collapse(na.rm = FALSE, sort = FALSE, nthreads = 4)
set.seed(101)
m <- matrix(rnorm(1e7), ncol = 1000)
data <- qDT(replicate(100, rnorm(1e5), simplify = FALSE))
g <- sample.int(1e4, 1e5, TRUE)

microbenchmark(R = colMeans(m),
               Rfast = Rfast::colmeans(m, parallel = TRUE, cores = 4),
               collapse = fmean(m))
## Warning in microbenchmark(R = colMeans(m), Rfast = Rfast::colmeans(m,
## parallel = TRUE, : less accurate nanosecond times to avoid potential integer
## overflows
## Unit: milliseconds
##      expr    min     lq   mean median     uq   max neval
##         R 14.993 20.939 21.907 22.855 22.886 34.40   100
##     Rfast  1.900  2.750  3.247  2.872  2.894 17.56   100
##  collapse  1.304  1.365  1.688  1.413  1.506 11.55   100
microbenchmark(R = rowsum(data, g, reorder = FALSE),
               data.table = data[, lapply(.SD, sum), by = g],
               collapse = fsum(data, g))
## Unit: milliseconds
##        expr    min     lq   mean median     uq   max neval
##           R 25.605 26.828 28.462  27.96 29.312 35.27   100
##  data.table 20.276 26.763 29.495  28.69 30.079 98.10   100
##    collapse  3.723  4.654  5.357   4.95  5.247 12.75   100
add_vars(data) <- g
microbenchmark(data.table = data[, lapply(.SD, median), by = g],
               collapse = data |> fgroup_by(g) |> fmedian())
## Unit: milliseconds
##        expr    min     lq   mean median    uq   max neval
##  data.table 251.56 265.33 274.76 276.70 283.5 312.6   100
##    collapse  84.87  91.22  96.28  94.81 101.6 112.7   100
d <- data.table(g = unique(g), x = 1, y = 2, z = 3)
microbenchmark(data.table = d[data, on = "g"],
               collapse = join(data, d, on = "g", verbose = 0))
## Unit: milliseconds
##        expr    min    lq   mean median    uq     max neval
##  data.table 14.302 17.93 39.755 23.818 34.00 108.242   100
##    collapse  2.902  3.14  3.438  3.309  3.79   4.086   100
microbenchmark(data.table = melt(data, "g"),
               collapse = pivot(data, "g"))
## Unit: milliseconds
##        expr   min    lq  mean median    uq   max neval
##  data.table 13.90 17.76 26.84  19.48 21.39 74.73   100
##    collapse 13.89 17.40 26.00  18.98 20.48 86.71   100
settransform(data, id = rowid(g))
cols <- grep("^V", names(data), value = TRUE)
microbenchmark(data.table = dcast(data, g ~ id, value.var = cols),
          collapse = pivot(data, ids = "g", names = "id", how = "w"))
## Unit: milliseconds
##        expr   min    lq  mean median    uq   max neval
##  data.table 121.6 210.6 216.6  218.6 221.9 259.4   100
##    collapse 112.3 137.8 143.4  142.9 148.8 184.1   100
###################################################
### code chunk number 47: Benchmark: Unique Values and Matching
###################################################
set.seed(101)
g_int <- sample.int(1e3, 1e7, replace = TRUE)
char <- c(letters, LETTERS, month.abb, month.name)
char <- outer(char, char, paste0)
g_char <- sample(char, 1e7, replace = TRUE)
microbenchmark(base_int = unique(g_int), collapse_int = funique(g_int),
            base_char = unique(g_char), collapse_char = funique(g_char))
## Unit: milliseconds
##           expr    min     lq   mean median     uq    max neval
##       base_int  92.95  98.78 103.21 102.06 104.94 171.41   100
##   collapse_int  13.31  13.92  15.97  14.97  15.78  29.41   100
##      base_char 143.94 150.15 159.89 161.40 166.20 231.22   100
##  collapse_char  33.99  35.14  38.79  36.72  39.88 101.54   100
microbenchmark(base_int = match(g_int, 1:1000),
               collapse_int = fmatch(g_int, 1:1000),
               base_char = match(g_char, char),
               data.table_char = chmatch(g_char, char),
               collapse_char = fmatch(g_char, char), times = 10)
## Unit: milliseconds
##             expr    min     lq   mean median     uq    max neval
##         base_int  42.31  42.87  48.17  46.14  49.44  69.59    10
##     collapse_int  13.35  13.55  15.54  14.73  17.24  21.11    10
##        base_char 123.00 124.22 134.12 128.18 139.92 176.19    10
##  data.table_char  66.00  66.25  70.52  66.51  73.50  89.45    10
##    collapse_char  43.25  43.34  47.29  43.60  46.45  64.42    10
###################################################
### Print Session Information
###################################################

sessionInfo()
## R version 4.3.0 (2023-04-21)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] fixest_0.11.3         Rfast_2.1.0           RcppParallel_5.1.7   
##  [4] RcppZiggurat_0.1.6    Rcpp_1.0.12           microbenchmark_1.4.10
##  [7] collapse_2.0.10       kit_0.0.13            magrittr_2.0.3       
## [10] data.table_1.15.0     fastverse_0.3.2      
## 
## loaded via a namespace (and not attached):
##  [1] Formula_1.2-5       numDeriv_2016.8-1.1 xfun_0.39          
##  [4] lattice_0.21-8      stringmagic_1.0.0   zoo_1.8-12         
##  [7] knitr_1.43          parallel_4.3.0      dreamerr_1.4.0     
## [10] sandwich_3.1-0      grid_4.3.0          compiler_4.3.0     
## [13] rstudioapi_0.14     tools_4.3.0         nlme_3.1-162       
## [16] evaluate_0.21