R/micro_read_chunked.r
read_ipums_micro_chunked.Rd
Reads a dataset downloaded from the IPUMS extract system in chunks: each chunk is read, your code is applied to it, and then the next chunk is processed. This allows you to work with data that is too large to fit in your computer's RAM all at once.
read_ipums_micro_chunked( ddi, callback, chunk_size = 10000, vars = NULL, data_file = NULL, verbose = TRUE, var_attrs = c("val_labels", "var_label", "var_desc"), lower_vars = FALSE ) read_ipums_micro_list_chunked( ddi, callback, chunk_size = 10000, vars = NULL, data_file = NULL, verbose = TRUE, var_attrs = c("val_labels", "var_label", "var_desc"), lower_vars = FALSE )
ddi | Either a filepath to a DDI xml file downloaded from
the website, or an `ipums_ddi` object parsed by `read_ipums_ddi()` |
---|---|
callback | An `ipums_callback` object, or a function that will be converted to an `IpumsSideEffectCallback` object |
chunk_size | An integer indicating how many observations to read in per chunk (defaults to 10,000). Setting this higher uses more RAM, but will usually be faster. |
vars | Names of variables to load. Accepts a character vector of names, or
`dplyr::select()`-style conventions. For hierarchical data, the rectype, serial, and pernum variables are always loaded. |
data_file | Specify a directory to look for the data file. If left empty, it will look in the same directory as the DDI file. |
verbose | Logical, indicating whether to print progress information to console. |
var_attrs | Variable attributes to add from the DDI, defaults to
adding all (val_labels, var_label and var_desc). See
`set_ipums_var_attributes()` for more details. |
lower_vars | Only if reading a DDI from a file, a logical indicating
whether to convert variable names to lowercase (default is FALSE, in line
with IPUMS conventions). Note that this argument will be ignored if
argument `ddi` is an `ipums_ddi` object rather than a file path. |
Value: depends on the callback object used.
Other ipums_read:
read_ipums_micro_yield()
,
read_ipums_micro()
,
read_ipums_sf()
,
read_nhgis()
,
read_terra_area()
,
read_terra_micro()
,
read_terra_raster()
# Select Minnesotan cases from CPS example (Note you can also accomplish # this and avoid having to even download a huge file using the "Select Cases" # functionality of the IPUMS extract system) mn_only <- read_ipums_micro_chunked( ipums_example("cps_00006.xml"), IpumsDataFrameCallback$new(function(x, pos) { x[x$STATEFIP == 27, ] }), chunk_size = 1000 # Generally you want this larger, but this example is a small file ) #> Use of data from IPUMS-CPS is subject to conditions including that users should #> cite the data appropriately. Use command `ipums_conditions()` for more details. # Tabulate INCTOT average by state without storing full dataset in memory library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union inc_by_state <- read_ipums_micro_chunked( ipums_example("cps_00006.xml"), IpumsDataFrameCallback$new(function(x, pos) { x %>% mutate( INCTOT = lbl_na_if( INCTOT, ~.lbl %in% c("Missing.", "N.I.U. (Not in Universe).")) ) %>% filter(!is.na(INCTOT)) %>% group_by(STATEFIP = as_factor(STATEFIP)) %>% summarize(INCTOT_SUM = sum(INCTOT), n = n(), .groups = "drop") }), chunk_size = 1000 # Generally you want this larger, but this example is a small file ) %>% group_by(STATEFIP) %>% summarize(avg_inc = sum(INCTOT_SUM) / sum(n)) #> Use of data from IPUMS-CPS is subject to conditions including that users should #> cite the data appropriately. Use command `ipums_conditions()` for more details. 
# x will be a list when using `read_ipums_micro_list_chunked()` read_ipums_micro_list_chunked( ipums_example("cps_00010.xml"), IpumsSideEffectCallback$new(function(x, pos) { print(paste0(nrow(x$PERSON), " persons and ", nrow(x$HOUSEHOLD), " households in this chunk.")) }), chunk_size = 1000 # Generally you want this larger, but this example is a small file ) #> Use of data from IPUMS-CPS is subject to conditions including that users should #> cite the data appropriately. Use command `ipums_conditions()` for more details. #> [1] "699 persons and 301 households in this chunk." #> [1] "701 persons and 299 households in this chunk." #> [1] "693 persons and 307 households in this chunk." #> [1] "685 persons and 315 households in this chunk." #> [1] "696 persons and 304 households in this chunk." #> [1] "691 persons and 309 households in this chunk." #> [1] "695 persons and 305 households in this chunk." #> [1] "691 persons and 309 households in this chunk." #> [1] "694 persons and 306 households in this chunk." #> [1] "692 persons and 308 households in this chunk." #> [1] "692 persons and 308 households in this chunk." #> [1] "39 persons and 14 households in this chunk." #> NULL # Using the biglm package, you can even run a regression without storing # the full dataset in memory library(dplyr) if (require(biglm)) { lm_results <- read_ipums_micro_chunked( ipums_example("cps_00015.xml"), IpumsBiglmCallback$new( INCTOT ~ AGE + HEALTH, # Simple regression (may not be very useful) function(x, pos) { x %>% mutate( INCTOT = lbl_na_if( INCTOT, ~.lbl %in% c("Missing.", "N.I.U. (Not in Universe).") ), HEALTH = as_factor(HEALTH) ) }), chunk_size = 1000 # Generally you want this larger, but this example is a small file ) summary(lm_results) } #> Loading required package: biglm #> Loading required package: DBI #> Use of data from IPUMS-CPS is subject to conditions including that users should #> cite the data appropriately. Use command `ipums_conditions()` for more details. 
#> Large data regression model: biglm(INCTOT ~ AGE + HEALTH, data, ...) #> Sample size = 8194 #> Coef (95% CI) SE p #> (Intercept) 25351.6183 21728.2210 28975.016 1811.6986 0.0000 #> AGE 499.7783 427.5196 572.037 36.1293 0.0000 #> HEALTHVery good -2135.1060 -5431.8110 1161.599 1648.3525 0.1952 #> HEALTHGood -10480.2543 -14052.9835 -6907.525 1786.3646 0.0000 #> HEALTHFair -23091.1061 -28274.2254 -17907.987 2591.5596 0.0000 #> HEALTHPoor -34341.0066 -42611.9852 -26070.028 4135.4893 0.0000