Skip to content

Commit 7f32214

Browse files
authored
NSDUH Data Example (#15)
* Data creation program * Add docs * Add to readme * Update labels, zap formats
1 parent 5a3456c commit 7f32214

File tree

6 files changed

+301
-24
lines changed

6 files changed

+301
-24
lines changed

R/data.R

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,35 @@
287287
#' @source \url{https://electionstudies.org/data-center/2020-time-series-study/}
288288
"anes_2020"
289289

290+
#' @title National Survey on Drug Use and Health (NSDUH) (2023) data
291+
#' @description A subset of variables from the NSDUH 2023 Public Use File
292+
#' @format A data frame with 56705 rows and 22 variables:
293+
#' \describe{
294+
#' \item{\code{QUESTID2}}{double RESPONDENT IDENTIFICATION}
295+
#' \item{\code{ANALWT2_C}}{double FIN PRSN-LEVEL SMPLE WGHT 2}
296+
#' \item{\code{VESTR_C}}{double VARIANCE STRATUM}
297+
#' \item{\code{VEREP}}{double VARIANCE PRIMARY SAMPLING UNIT}
298+
#' \item{\code{NICVAPMON}}{integer RC-NICOTINE VAPING - PAST MONTH USE}
299+
#' \item{\code{TOBMON}}{integer RC-ANY TOBACCO - PAST MONTH USE}
300+
#' \item{\code{ALCMON}}{integer RC-ALCOHOL - PAST MONTH USE}
301+
#' \item{\code{ILLMON}}{integer RC-ANY ILLICIT DRUG - PAST MONTH USE}
302+
#' \item{\code{ILTOBVAPALC}}{integer RC-TOBACCO, NICOTINE VAPING, ALCOHOL, ILLICIT DRUG - PST MON}
303+
#' \item{\code{BNGDRKMON}}{integer RC-BINGE ALCOHOL USE PAST 30 DAYS}
304+
#' \item{\code{IRPYUD5ALC}}{integer ALCOHOL USE DISORDER IN THE PAST YEAR - IMP REV}
305+
#' \item{\code{UD5ILLANY}}{integer RC-DRUG USE DISORDER - PAST YEAR USERS}
306+
#' \item{\code{UD5ILALANY}}{integer RC-DRUG OR ALCOHOL USE DISORDER - PAST YEAR USERS}
307+
#' \item{\code{YMDELT}}{factor RC-YOUTH: LIFETIME MAJOR DEPRESSIVE EPISODE (MDE)}
308+
#' \item{\code{YMDEYR}}{factor RC-YOUTH: PAST YEAR MAJOR DEPRESSIVE EPISODE (MDE)}
309+
#' \item{\code{MDEIMPY}}{factor RC-YOUTH: MDE WITH SEVERE ROLE IMPAIRMENT}
310+
#' \item{\code{AMIPY}}{integer RC-IMP AMI IND (1/0) BASED ON PREDICTED SMI PROB PY}
311+
#' \item{\code{SMIPY}}{integer RC-IMP SMI IND (1/0) BASED ON PREDICTED SMI PROB PY}
312+
#' \item{\code{AGE3}}{factor RECODE - FINAL EDITED AGE}
313+
#' \item{\code{NEWRACE2}}{factor RC-RACE/HISPANICITY RECODE (7 LEVELS)}
314+
#' \item{\code{IRSEX}}{factor SEX AT BIRTH - IMPUTATION REVISED}
315+
#' \item{\code{POVERTY3}}{factor RC-POVERTY LEVEL-NEW INC (% OF US CENSUS POVERTY THRESHOLD)}
316+
#' }
317+
#' @source \url{https://www.samhsa.gov/data/data-we-collect/nsduh-national-survey-drug-use-and-health/datafiles}
318+
"nsduh_2023"
290319

291320
#' @title California Health Interview Survey (CHIS) (2023) data
292321
#' @description A subset of variables from the CHIS 2023 Public Use File
@@ -313,4 +342,4 @@
313342
#' \item{\code{RAKEDW1 - RAKEDW80}}{CHIS2023 RAKED WEIGHT - REPLICATE 1 through REPLICATE 80}
314343
#' }
315344
#' @source \url{https://healthpolicy.ucla.edu/our-work/public-use-files/one-year-public-use-files-pufs}
316-
"chis_2023"
345+
"chis_2023"

README.Rmd

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ knitr::opts_chunk$set(
2020
fig.align = "center",
2121
tidy = "styler"
2222
)
23-
library(tidyverse)
2423
```
2524

2625
The **srvyexploR** package provides datasets used in the book [Exploring Complex Survey Data Analysis Using R: A Tidy Introduction with {srvyr} and {survey}](https://tidy-survey-r.github.io/tidy-survey-book/). This will help readers follow along with the examples and work through the exercises.
@@ -77,6 +76,16 @@ head(ncvs_2021_person)
7776
head(ncvs_2021_incident)
7877
```
7978

79+
### NSDUH
80+
81+
The National Survey on Drug Use and Health (NSDUH) is an annual survey of the civilian, non-institutionalized population in the United States who are at least 12 years old. Topics include substance use (tobacco, alcohol, and illicit drugs including marijuana), mental health, and general health. This package provides a subset of the variables from the 2023 Public Use File. For more details about the study and the data, refer to the [Methodological Summary and Definitions](https://www.samhsa.gov/data/sites/default/files/reports/rpt47098/Methodological%20Summary%20and%20Definitions/2023-nsduh-method-summary-defs.pdf), [Data User's Guide](https://www.samhsa.gov/data/sites/default/files/reports/rpt56198/2023-nsduh-puf-data-users-guide.pdf), and [Codebook](https://www.samhsa.gov/data/system/files/media-puf-file/NSDUH-2023-DS0001-info-codebook_v1.pdf).
82+
83+
```{r}
84+
#| label: nsduh-show
85+
86+
head(nsduh_2023)
87+
```
88+
8089
### RECS
8190

8291
Three files are included associated with RECS - a dataset with the 2015 data with some derived variables created for the book (`recs_2015`), the 2020 data with some derived variables created for the book (`recs_2020`), and the 2020 data with the original variables (`recs_2020_raw`). RECS is a survey about energy consumption and expenditure among residential households in the United States and has been conducted since 1979 by the Energy Information Administration. More information about the original data is available at the [RECS website](https://www.eia.gov/consumption/residential/data/2020/).
@@ -152,7 +161,7 @@ Anyone interested in redistributing the ANES data should refer to the [ANES FAQ
152161

153162
ANES:
154163

155-
+ American National Election Studies. 2021. ANES 2020 Time Series Study Full Release [dataset and documentation]. July 19, 2021 version. https://www.electionstudies.org
164+
+ American National Election Studies, 2021. ANES 2020 Time Series Study Full Release [dataset and documentation]. July 19, 2021 version. https://www.electionstudies.org
156165

157166
CHIS:
158167

@@ -162,7 +171,12 @@ NCVS:
162171

163172
+ United States. Bureau of Justice Statistics. National Crime Victimization Survey, [United States], 2021. Inter-university Consortium for Political and Social Research [distributor], 2022-09-19. https://doi.org/10.3886/ICPSR38429.v1
164173

174+
NSDUH:
175+
176+
+ Center for Behavioral Health Statistics and Quality, 2025. 2023 National Survey on Drug Use
177+
and Health: Public use file data users’ guide. https://www.samhsa.gov/data/data-we-collect/nsduh/datafiles
178+
165179
RECS:
166180

167-
+ U.S. Energy Information Administration. 2024. Residential Energy Consumption 2020 Survey Data. [dataset and documentation]. January 2024 version. https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata
168-
+ U.S. Energy Information Administration. 2018 Residential Energy Consumption 2015 Survey Data. [dataset and documentation]. December 2018 version. https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata
181+
+ U.S. Energy Information Administration, 2024. Residential Energy Consumption 2020 Survey Data. [dataset and documentation]. January 2024 version. https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata
182+
+ U.S. Energy Information Administration, 2018. Residential Energy Consumption 2015 Survey Data. [dataset and documentation]. December 2018 version. https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata

README.md

Lines changed: 110 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,76 @@ Once the package is loaded, you can use the data immediately as follows:
4747

4848
``` r
4949
head(anes_2020)
50-
#> # A tibble: 6 × 65
51-
#> V200001 CaseID V200002 InterviewMode V200010b Weight V200010c VarUnit V200010d
52-
#> <dbl> <dbl> <hvn_l> <fct> <dbl> <dbl> <dbl> <fct> <dbl>
53-
#> 1 200015 200015 3 Web 1.01 1.01 2 2 9
54-
#> 2 200022 200022 3 Web 1.16 1.16 2 2 26
55-
#> 3 200039 200039 3 Web 0.769 0.769 1 1 41
56-
#> 4 200046 200046 3 Web 0.521 0.521 2 2 29
57-
#> 5 200053 200053 3 Web 0.966 0.966 1 1 23
58-
#> 6 200060 200060 3 Web 0.235 0.235 2 2 37
59-
#> # ℹ 56 more variables: Stratum <fct>, V201006 <hvn_lbll>,
60-
#> # CampaignInterest <fct>, V201023 <hvn_lbll>, EarlyVote2020 <fct>,
61-
#> # V201024 <hvn_lbll>, V201025x <hvn_lbll>, V201028 <hvn_lbll>,
62-
#> # V201029 <hvn_lbll>, V201101 <hvn_lbll>, V201102 <hvn_lbll>,
63-
#> # VotedPres2016 <fct>, V201103 <hvn_lbll>, VotedPres2016_selection <fct>,
64-
#> # V201228 <hvn_lbll>, V201229 <hvn_lbll>, V201230 <hvn_lbll>,
65-
#> # V201231x <hvn_lbll>, PartyID <fct>, V201233 <hvn_lbll>, …
50+
#> V200001 CaseID V200002 InterviewMode V200010b Weight V200010c VarUnit
51+
#> 1 200015 200015 3 Web 1.0057375 1.0057375 2 2
52+
#> 2 200022 200022 3 Web 1.1634731 1.1634731 2 2
53+
#> 3 200039 200039 3 Web 0.7686811 0.7686811 1 1
54+
#> 4 200046 200046 3 Web 0.5210195 0.5210195 2 2
55+
#> 5 200053 200053 3 Web 0.9657892 0.9657892 1 1
56+
#> 6 200060 200060 3 Web 0.2347078 0.2347078 2 2
57+
#> V200010d Stratum V201006 CampaignInterest V201023 EarlyVote2020 V201024
58+
#> 1 9 9 2 Somewhat interested -1 <NA> -1
59+
#> 2 26 26 3 Not much interested -1 <NA> -1
60+
#> 3 41 41 2 Somewhat interested -1 <NA> -1
61+
#> 4 29 29 3 Not much interested -1 <NA> -1
62+
#> 5 23 23 2 Somewhat interested -1 <NA> -1
63+
#> 6 37 37 1 Very much interested -1 <NA> -1
64+
#> V201025x V201028 V201029 V201101 V201102 VotedPres2016 V201103
65+
#> 1 3 -1 -1 -1 1 Yes 2
66+
#> 2 3 -1 -1 -1 1 Yes 5
67+
#> 3 3 -1 -1 -1 1 Yes 1
68+
#> 4 3 -1 -1 -1 1 Yes 1
69+
#> 5 3 -1 -1 -1 1 Yes 2
70+
#> 6 3 -1 -1 -1 2 No -1
71+
#> VotedPres2016_selection V201228 V201229 V201230 V201231x
72+
#> 1 Trump 2 1 -1 7
73+
#> 2 Other 5 -1 2 4
74+
#> 3 Clinton 3 -1 3 3
75+
#> 4 Clinton 2 2 -1 6
76+
#> 5 Trump 3 -1 2 4
77+
#> 6 <NA> 3 -1 3 3
78+
#> PartyID V201233 TrustGovernment V201237
79+
#> 1 Strong republican 5 Never 3
80+
#> 2 Independent 5 Never 4
81+
#> 3 Independent-democrat 4 Some of the time 4
82+
#> 4 Not very strong republican 3 About half the time 2
83+
#> 5 Independent 5 Never 4
84+
#> 6 Independent-democrat 4 Some of the time 2
85+
#> TrustPeople V201507x Age AgeGroup V201510 Education V201546
86+
#> 1 About half the time 46 46 40-49 6 Bachelor's 1
87+
#> 2 Some of the time 37 37 30-39 3 Post HS 2
88+
#> 3 Some of the time 40 40 40-49 2 High school 2
89+
#> 4 Most of the time 41 41 40-49 4 Post HS 2
90+
#> 5 Some of the time 72 72 70 or older 8 Graduate 2
91+
#> 6 Most of the time 71 71 70 or older 3 Post HS 2
92+
#> V201547a V201547b V201547c V201547d V201547e V201547z V201549x RaceEth
93+
#> 1 -3 -3 -3 -3 -3 -3 3 Hispanic
94+
#> 2 -3 -3 -3 -3 -3 -3 4 Asian, NH/PI
95+
#> 3 -3 -3 -3 -3 -3 -3 1 White
96+
#> 4 -3 -3 -3 -3 -3 -3 4 Asian, NH/PI
97+
#> 5 -3 -3 -3 -3 -3 -3 5 AI/AN
98+
#> 6 -3 -3 -3 -3 -3 -3 1 White
99+
#> V201600 Gender V201607 V201610 V201611 V201613 V201615 V201616 V201617x
100+
#> 1 1 Male -3 -3 -3 -3 -3 -3 21
101+
#> 2 2 Female -3 -3 -3 -3 -3 -3 13
102+
#> 3 2 Female -3 -3 -3 -3 -3 -3 17
103+
#> 4 1 Male -3 -3 -3 -3 -3 -3 7
104+
#> 5 1 Male -3 -3 -3 -3 -3 -3 22
105+
#> 6 2 Female -3 -3 -3 -3 -3 -3 3
106+
#> Income Income7 V202051 V202066 V202072 VotedPres2020
107+
#> 1 $175,000-249,999 $125k or more -1 1 -1 <NA>
108+
#> 2 $70,000-74,999 $60k to < 80k -1 4 1 Yes
109+
#> 3 $100,000-109,999 $100k to < 125k -1 4 1 Yes
110+
#> 4 $35,000-39,999 $20k to < 40k -1 4 1 Yes
111+
#> 5 $250,000 or more $125k or more -1 4 1 Yes
112+
#> 6 $15,000-19,999 Under $20k -1 4 1 Yes
113+
#> V202073 V202109x V202110x VotedPres2020_selection
114+
#> 1 -1 0 -1 <NA>
115+
#> 2 3 1 3 Other
116+
#> 3 1 1 1 Biden
117+
#> 4 1 1 1 Biden
118+
#> 5 2 1 2 Trump
119+
#> 6 1 1 1 Biden
66120
```
67121

68122
See `?anes_2020` for more information about the data.
@@ -129,6 +183,37 @@ head(ncvs_2021_incident)
129183
#> # V4267 <fct>, V4268 <fct>, V4269 <fct>, V4270 <fct>, V4271 <fct>, …
130184
```
131185

186+
### NSDUH
187+
188+
The National Survey on Drug Use and Health (NSDUH) is an annual survey
189+
of the civilian, non-institutionalized population in the United States
190+
who are at least 12 years old. Topics include substance use (tobacco,
191+
alcohol, and illicit drugs including marijuana), mental health, and
192+
general health. This package provides a subset of the variables from the
193+
2023 Public Use File. For more details about the study and the data,
194+
refer to the [Methodological Summary and
195+
Definitions](https://www.samhsa.gov/data/sites/default/files/reports/rpt47098/Methodological%20Summary%20and%20Definitions/2023-nsduh-method-summary-defs.pdf),
196+
[Data User’s
197+
Guide](https://www.samhsa.gov/data/sites/default/files/reports/rpt56198/2023-nsduh-puf-data-users-guide.pdf),
198+
and
199+
[Codebook](https://www.samhsa.gov/data/system/files/media-puf-file/NSDUH-2023-DS0001-info-codebook_v1.pdf).
200+
201+
``` r
202+
head(nsduh_2023)
203+
#> # A tibble: 6 × 22
204+
#> QUESTID2 ANALWT2_C VESTR_C VEREP NICVAPMON TOBMON ALCMON ILLMON ILTOBVAPALC
205+
#> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
206+
#> 1 10000053 3276. 40031 2 0 0 1 0 1
207+
#> 2 10000679 15630. 40021 2 0 1 1 0 1
208+
#> 3 10001208 4018. 40043 1 0 1 0 1 1
209+
#> 4 10001260 10712. 40030 2 0 0 0 0 0
210+
#> 5 10001588 8195. 40023 2 0 0 1 0 1
211+
#> 6 10004996 3771. 40048 1 1 1 1 0 1
212+
#> # ℹ 13 more variables: BNGDRKMON <int>, IRPYUD5ALC <int>, UD5ILLANY <int>,
213+
#> # UD5ILALANY <int>, YMDELT <fct>, YMDEYR <fct>, MDEIMPY <fct>, AMIPY <int>,
214+
#> # SMIPY <int>, AGE3 <fct>, NEWRACE2 <fct>, IRSEX <fct>, POVERTY3 <fct>
215+
```
216+
132217
### RECS
133218

134219
Three files are included associated with RECS - a dataset with the 2015
@@ -309,7 +394,7 @@ Anyone interested in redistributing the ANES data should refer to the
309394

310395
ANES:
311396

312-
- American National Election Studies. 2021. ANES 2020 Time Series Study
397+
- American National Election Studies, 2021. ANES 2020 Time Series Study
313398
Full Release \[dataset and documentation\]. July 19, 2021 version.
314399
<https://www.electionstudies.org>
315400

@@ -326,13 +411,19 @@ NCVS:
326411
Consortium for Political and Social Research \[distributor\],
327412
2022-09-19. <https://doi.org/10.3886/ICPSR38429.v1>
328413

414+
NSDUH:
415+
416+
- Center for Behavioral Health Statistics and Quality, 2025. 2023
417+
National Survey on Drug Use and Health: Public use file data users’
418+
guide. <https://www.samhsa.gov/data/data-we-collect/nsduh/datafiles>
419+
329420
RECS:
330421

331-
- U.S. Energy Information Administration. 2024. Residential Energy
422+
- U.S. Energy Information Administration, 2024. Residential Energy
332423
Consumption 2020 Survey Data. \[dataset and documentation\]. January
333424
2024 version.
334425
<https://www.eia.gov/consumption/residential/data/2020/index.php?view=microdata>
335-
- U.S. Energy Information Administration. 2018 Residential Energy
426+
- U.S. Energy Information Administration, 2018. Residential Energy
336427
Consumption 2015 Survey Data. \[dataset and documentation\]. December
337428
2018 version.
338429
<https://www.eia.gov/consumption/residential/data/2015/index.php?view=microdata>

data-raw/nsduh_2023.R

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
## code to prepare `nsduh_2023` dataset goes here
2+
3+
library(tidyverse)
4+
5+
# Download the NSDUH 2023 public use file bundle into a temporary folder at
# the project root, unpack it, and load the data into the session.
#
# The existence check and the creation use the same here::here() path, so the
# script behaves the same regardless of the current working directory.
nsduh_temp_dir <- here::here("nsduh-temp")
if (!dir.exists(nsduh_temp_dir)) {
  dir.create(nsduh_temp_dir)
}

download.file(
  "https://www.samhsa.gov/data/system/files/media-puf-file/NSDUH-2023-DS0001-bndl-data-r_v1.zip",
  file.path(nsduh_temp_dir, "nsduh-2023.zip")
)

unzip(
  file.path(nsduh_temp_dir, "nsduh-2023.zip"),
  exdir = nsduh_temp_dir
)

# Presumably this creates `puf2023_102124`, which the rest of the script uses.
load(file.path(nsduh_temp_dir, "NSDUH_2023.Rdata"))
16+
17+
# Look-up table of every variable in the public use file together with its
# "label" attribute, exported as a spreadsheet in the temporary folder.
varinfo <- tibble(Variables = names(puf2023_102124)) %>%
  mutate(
    Label = map_chr(Variables, \(v) attr(puf2023_102124[[v]], "label"))
  )

varinfo %>%
  openxlsx2::write_xlsx(here::here("nsduh-temp", "variable-list.xlsx"))
23+
24+
# Convert a 1/2 coded vector into a Yes/No factor.
#
# @param x A numeric vector in which 1 means "Yes" and 2 means "No".
# @return A factor with levels "Yes" and "No" (in that order). Any value
#   other than 1 or 2, including NA, becomes NA.
#
# The levels are given explicitly so the labels always line up with the codes,
# even when a column contains only one of the two codes (or neither).
fct_yesno_12 <- function(x) {
  factor(x, levels = 1:2, labels = c("Yes", "No"))
}
30+
31+
# Keep the identifier, weight and design variables plus the selected outcome
# and demographic variables, then recode them:
#   * the 0/1 indicator variables are stored as integers,
#   * the youth MDE variables become Yes/No factors via fct_yesno_12(),
#   * the demographic recodes become labelled factors.
#
# factor(x, labels = ...) assigns labels to the sorted distinct codes, so each
# label vector below is listed in code order.
nsduh_slim <- puf2023_102124 %>%
  select(
    QUESTID2, ANALWT2_C, VESTR_C, VEREP,
    NICVAPMON, TOBMON, ALCMON, ILLMON, ILTOBVAPALC, BNGDRKMON,
    IRPYUD5ALC, UD5ILLANY, UD5ILALANY,
    YMDELT, YMDEYR, MDEIMPY,
    AMIPY, SMIPY,
    AGE3, NEWRACE2, IRSEX, POVERTY3
  ) %>%
  mutate(
    across(
      c(
        NICVAPMON, TOBMON, ALCMON, ILLMON, ILTOBVAPALC, BNGDRKMON,
        IRPYUD5ALC, UD5ILLANY, UD5ILALANY, AMIPY, SMIPY
      ),
      as.integer
    ),
    across(c(YMDELT, YMDEYR, MDEIMPY), fct_yesno_12),
    AGE3 = factor(
      AGE3,
      labels = c(
        "12-13", "14-15", "16-17", "18-20", "21-23", "24-25",
        "26-29", "30-34", "35-49", "50-64", "65+"
      )
    ),
    # NEWRACE2 is a race/Hispanicity recode: the first six categories are the
    # non-Hispanic (NH) groups, so the seventh is labelled "Hispanic".
    # NOTE(review): confirm against the NSDUH 2023 codebook.
    NEWRACE2 = factor(
      NEWRACE2,
      labels = c(
        "White, NH", "Black, NH", "Native Am/AK Native, NH",
        "Native HI/PI, NH", "Asian, NH", "More than one race, NH",
        "Hispanic"
      )
    ),
    IRSEX = factor(IRSEX, labels = c("Male", "Female")),
    POVERTY3 = factor(
      POVERTY3,
      labels = c("0-100% FPL", "101-200% FPL", "201%+ FPL")
    )
  )
48+
49+
# Print a cross-tabulation of a recoded variable in `nsduh_slim` against the
# same variable in the original file, so the recode can be checked by eye.
#
# @param var Name of the variable to check, as a string.
check_vars <- function(var) {
  message(var)
  recoded <- nsduh_slim[[var]]
  original <- puf2023_102124[[var]]
  print(table(recoded, original, useNA = "ifany", dnn = NULL))
}
54+
55+
# Cross-check every factor column against its original coding.
walk(names(select(nsduh_slim, where(is.factor))), check_vars)

# Show the remaining (non-factor) columns.
select(nsduh_slim, -where(is.factor))
62+
63+
# Update labels and zap formats

# Copy the "label" attribute of a variable from the original file onto the
# same column of `nsduh_slim`.
#
# @param var Name of the variable, as a string.
#
# `<<-` is used deliberately: the function is called for its side effect of
# modifying the script-level `nsduh_slim` object.
update_label <- function(var) {
  attr(nsduh_slim[[var]], "label") <<- attr(puf2023_102124[[var]], "label")
}

walk(names(nsduh_slim), update_label)

# zap_formats() returns a modified copy, so the result must be assigned back;
# otherwise the format attributes are kept.
nsduh_slim <- haven::zap_formats(nsduh_slim)
72+
73+
# Inspect the structure of the prepared data.
str(nsduh_slim)

# One row per column: its name, class(es) and label. `Class2` collapses
# multi-element class vectors with ";" and reports "numeric" as "double".
nsduh_slim_md <- tibble(
  Variable = names(nsduh_slim),
  Class = sapply(nsduh_slim, class),
  Label = map_chr(names(nsduh_slim), ~ attr(nsduh_slim[[.x]], "label"))
) %>%
  mutate(Class2 = map_chr(Class, \(cls) str_flatten(cls, collapse = ";"))) %>%
  mutate(Class2 = if_else(Class2 == "numeric", "double", Class2))

# Print ready-to-paste roxygen \item lines for the dataset documentation.
with(
  nsduh_slim_md,
  cat(
    str_c("#' \\item{\\code{", Variable, "}}{", Class2, " ", Label, "}"),
    sep = "\n"
  )
)

# Final dataset under its exported name, with a few sanity checks.
nsduh_2023 <- nsduh_slim

summary(nsduh_2023)
nrow(nsduh_2023)
ncol(nsduh_2023)

# Save the dataset into the package.
usethis::use_data(nsduh_2023, overwrite = TRUE)

# Remove the temporary download folder.
unlink(here::here("nsduh-temp"), recursive = TRUE)

data/nsduh_2023.rda

872 KB
Binary file not shown.

0 commit comments

Comments
 (0)