knitr::opts_chunk$set(echo = TRUE, warnings = FALSE, message = FALSE)
setwd("C:/Users/keess/Box/ASA Share/ASA 2020/Resources/Tufts-ASA_R_Stats_Modules/TNDS 2021 workshop")
library(tidyverse)
library(readxl)
library(haven)
library(plotrix)
library(expss)
library(moments)
# load("Input_Data/workshop_data.RData")
# write_csv(x = workshop_data, "Input_Data/workshop_data.csv")
## read in a csv file
df <- read_csv("Input_Data/workshop_data.csv")
names(df) # variable names in df
## [1] "study_id" "study_arm" "agedays" "agemonths" "gender"
## [6] "cgage" "edu" "av_weight" "av_len" "av_muac"
## [11] "_zwei" "_zwfl" "hfias_score" "hfias_cat"
nrow(df) # number of rows in df
## [1] 2653
ncol(df) # number of columns in df
## [1] 14
str(df) # structure
## tibble [2,653 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ study_id : num [1:2653] 1 2 3 4 5 6 7 8 9 10 ...
## $ study_arm : num [1:2653] 1 1 1 1 1 1 1 1 1 1 ...
## $ agedays : num [1:2653] 399 477 1506 429 527 ...
## $ agemonths : num [1:2653] 13.1 15.7 49.5 14.1 17.3 ...
## $ gender : num [1:2653] 1 1 1 1 0 0 0 1 0 0 ...
## $ cgage : num [1:2653] 21 25 46 29 20 25 23 20 35 25 ...
## $ edu : num [1:2653] 0 3 0 0 0 0 3 3 0 1 ...
## $ av_weight : num [1:2653] 7.14 7.2 12.79 6.76 6.93 ...
## $ av_len : num [1:2653] 70.6 70.6 103.3 68.2 72.3 ...
## $ av_muac : num [1:2653] 11.8 11.9 11.9 12 12 ...
## $ _zwei : num [1:2653] -2.95 -3.33 -2.06 -3.59 -3.19 -2.39 -2.91 -2.6 -2.89 -2.55 ...
## $ _zwfl : num [1:2653] -2.26 -2.16 -2.89 -2.14 -2.53 -1.55 -1.96 -1.45 -2.15 -1.63 ...
## $ hfias_score: num [1:2653] 16 21 15 0 10 12 15 16 11 26 ...
## $ hfias_cat : num [1:2653] 4 4 4 1 4 3 4 4 3 4 ...
## - attr(*, "spec")=
## .. cols(
## .. study_id = col_double(),
## .. study_arm = col_double(),
## .. agedays = col_double(),
## .. agemonths = col_double(),
## .. gender = col_double(),
## .. cgage = col_double(),
## .. edu = col_double(),
## .. av_weight = col_double(),
## .. av_len = col_double(),
## .. av_muac = col_double(),
## .. `_zwei` = col_double(),
## .. `_zwfl` = col_double(),
## .. hfias_score = col_double(),
## .. hfias_cat = col_double()
## .. )
## find all unique values of gender
unique(df$gender)
## [1] 1 0 -99
# find the number of unique values of age in days
n_distinct(df$agedays)[1:10]
## [1] 690 NA NA NA NA NA NA NA NA NA
# the number of rows with complete observations
sum(complete.cases(df))
## [1] 2623
# the number of missing observations with agedays
sum(is.na(df$agedays))
## [1] 13
## lapply can take a function (like unique), and apply it to all columns of a dataframe (df)
unique_vals <- lapply(df, unique)
distinct_vals <- lapply(df, n_distinct)
## missing vals for each column:
missing <- lapply(df, function(x) sum(is.na(x)))
# base R:
df$ageyears = df$agedays/365.25
# same code in tidyverse
# df_w_age <- df %>%
# mutate(ageyears = agedays/365.25)
## make labels from package "expss"
df_w_labels <- df %>%
apply_labels(ageyears = "age in years",
agemonths = "age in months",
agedays = "age in days",
edu = "education")
## labelling observations not really done in R (I didn't think they were done anywhere)
## but if you want to do it, use the rownames() function
## you can also include a label variable that corresponds to being a unique identifier for each
## row
# ?rownames()
# ?colnames()
# simple base R summary function
summary(df)
## study_id study_arm agedays agemonths
## Min. : 1 Min. :1.000 Min. : 171.0 Min. : 5.622
## 1st Qu.:2595 1st Qu.:2.000 1st Qu.: 231.0 1st Qu.: 7.595
## Median :5115 Median :3.000 Median : 326.0 Median :10.718
## Mean :4430 Mean :2.611 Mean : 403.1 Mean :13.254
## 3rd Qu.:7624 3rd Qu.:4.000 3rd Qu.: 493.2 3rd Qu.:16.216
## Max. :8500 Max. :4.000 Max. :1809.0 Max. :59.474
## NA's :13 NA's :13
## gender cgage edu av_weight
## Min. :-99.0000 Min. :-99.00 Min. :-99.0000 Min. : 4.085
## 1st Qu.: 0.0000 1st Qu.: 21.00 1st Qu.: 0.0000 1st Qu.: 5.955
## Median : 0.0000 Median : 26.00 Median : 0.0000 Median : 6.510
## Mean : 0.3129 Mean : 26.59 Mean : 0.4124 Mean : 6.677
## 3rd Qu.: 1.0000 3rd Qu.: 32.00 3rd Qu.: 1.0000 3rd Qu.: 7.220
## Max. : 1.0000 Max. : 80.00 Max. : 5.0000 Max. :15.010
## NA's :1
## av_len av_muac _zwei _zwfl
## Min. : 53.15 Min. :11.50 Min. :-6.140 Min. :-4.840
## 1st Qu.: 63.55 1st Qu.:11.73 1st Qu.:-3.390 1st Qu.:-2.270
## Median : 66.85 Median :11.97 Median :-2.870 Median :-1.790
## Mean : 68.03 Mean :11.97 Mean :-2.882 Mean :-1.799
## 3rd Qu.: 71.45 3rd Qu.:12.20 3rd Qu.:-2.330 3rd Qu.:-1.320
## Max. :108.20 Max. :12.47 Max. : 1.260 Max. : 1.370
## NA's :3 NA's :17 NA's :8
## hfias_score hfias_cat ageyears
## Min. : 0.000 Min. :1.000 Min. :0.4682
## 1st Qu.: 1.000 1st Qu.:1.000 1st Qu.:0.6324
## Median :10.000 Median :4.000 Median :0.8925
## Mean : 9.451 Mean :3.083 Mean :1.1037
## 3rd Qu.:14.000 3rd Qu.:4.000 3rd Qu.:1.3504
## Max. :27.000 Max. :4.000 Max. :4.9528
## NA's :9 NA's :9 NA's :13
# individual summary functions
mean(df$agemonths, na.rm = T)
## [1] 13.254
median(df$agemonths, na.rm = T)
## [1] 10.7178
sd(df$agemonths, na.rm = T)
## [1] 7.900913
# no explicit function for the mode of a vector, but we can make one
getmode <- function(x){
uniquev <- unique(x)
uniquev[which.max(tabulate(match(x, uniquev)))]
}
hist(df$agemonths, breaks = 100, xlim = c(min(df$agemonths, na.rm = T), max(df$agemonths, na.rm = T)))
getmode(df$agemonths)
## [1] 6.082185
## REMEMBER: getmode is not an actual R function, it is made up for the purposes of displaying R's function syntax
range(df$agemonths, na.rm = T)
## [1] 5.621912 59.473907
min(df$agemonths, na.rm = T)
## [1] 5.621912
max(df$agemonths, na.rm = T)
## [1] 59.47391
## skewness and kurtosis are from the "moments" package
skewness(df$agemonths, na.rm = T)
## [1] 2.149153
kurtosis(df$agemonths, na.rm = T)
## [1] 9.280115
quantile(df$agemonths, probs = c(.10, .25, .50, .75, .90), na.rm = T)
## 10% 25% 50% 75% 90%
## 6.542459 7.594512 10.717796 16.216421 22.921619
# ?quantile()
mytable <- table(df$gender, df$edu)
# tidyverse function
count(df, gender, sort = T)
## # A tibble: 3 x 2
## gender n
## <dbl> <int>
## 1 0 1523
## 2 1 1127
## 3 -99 3
rownames(mytable) = c("missing", "male", "female")
colnames(mytable) = c("missing", "none", "primary school", "secondary school",
"some high school", "some college", "graduate school")
# summarize table across rows
margin.table(mytable, 1)
##
## missing male female
## 3 1523 1127
# summarize table across columns
margin.table(mytable, 2)
##
## missing none primary school secondary school
## 14 1419 574 59
## some high school some college graduate school
## 563 21 3
# calculate row percentages
round(prop.table(mytable, 1), 3)
##
## missing none primary school secondary school some high school
## missing 0.000 0.667 0.000 0.000 0.333
## male 0.005 0.538 0.221 0.020 0.207
## female 0.005 0.530 0.211 0.025 0.219
##
## some college graduate school
## missing 0.000 0.000
## male 0.007 0.001
## female 0.009 0.001
# calculate column percentages
round(prop.table(mytable, 2), 3)
##
## missing none primary school secondary school some high school
## missing 0.000 0.001 0.000 0.000 0.002
## male 0.571 0.578 0.585 0.525 0.560
## female 0.429 0.421 0.415 0.475 0.439
##
## some college graduate school
## missing 0.000 0.000
## male 0.524 0.667
## female 0.476 0.333
## tidyverse version. ifelse is a vectorized if/else statement, meaning it applies
## if/else logic across an entire column of a dataframe. R's native if/else logic can only
## be applied to one data point at a time, and therefore ifelse is often more efficient when
## working with data frames
df_cat <- df %>%
mutate(hfias_cat = ifelse(hfias_score < 1, 4,
ifelse(hfias_score >= 1 & hfias_score < 10, 3,
ifelse(hfias_score >= 10 & hfias_score < 14, 2,
ifelse(hfias_score >= 14, 1, NA)))),
hfias_cat = factor(hfias_cat))
## base r version. We are indexing all of the columns where our conditions are satisfied, and
## assigning them a category based on the condition
df$hfias_cat[df$hfias_cat < 1] <- 4
df$hfias_cat[between(df$hfias_cat, 1, 10)] <- 3
df$hfias_cat[between(df$hfias_cat, 10, 14)] <- 2
df$hfias_cat[df$hfias_cat >= 14] <- 1
## save your output as a permanent RData file with the save function
save(df_cat, file = "Output Data/output_df.RData")
## or save as a csv
write.csv(df_cat, file = "Output Data/output_df.csv")
## csv files are more memory intensive than RData files, so if you are working with large data sets,
## use RData if you are exclusively working in R
n_distinct(df$hfias_score)
## [1] 29
## use breaks to specify the number of bins you want in your histogram.
hist(df$hfias_score, breaks = 29, main = "Histogram of HFIAS Score",
freq = TRUE, col = "blue")
## Comparison tests (t-test and Wilcoxon)
df_ttest <- df[df$gender == 0 | df$gender == 1,]
# simple unpaired t test
t.test(df_ttest$av_len ~ df_ttest$gender, conf.level = 0.95, paired = FALSE, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: df_ttest$av_len by df_ttest$gender
## t = -5.4411, df = 2337.4, p-value = 5.848e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.7902305 -0.8416832
## sample estimates:
## mean in group 0 mean in group 1
## 67.46464 68.78060
# simple Wilcoxon rank sum
wilcox.test(df_ttest$av_len ~ df_ttest$gender, conf.int = T, conf.level = 0.95, exact = F,
correct = T)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df_ttest$av_len by df_ttest$gender
## W = 736851, p-value = 8.454e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -1.7500611 -0.9000049
## sample estimates:
## difference in location
## -1.300048
mean(df$av_len, na.rm = T)
## [1] 68.02908
p_corr <- cor(df$av_weight, df$av_len, use = "complete.obs", method = c("pearson"))
s_corr <- cor(df$av_weight, df$av_len, use = "complete.obs", method = c("spearman"))
p_corr
## [1] 0.9449736
s_corr
## [1] 0.9309364
## using complete observations for the sake of simplicity
df_mod <- df[complete.cases(df),]
# model of average length compared to weight
length_model <- lm(av_len ~ av_weight, data = df_mod)
summary(length_model) # get description of model
##
## Call:
## lm(formula = av_len ~ av_weight, data = df_mod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9180 -1.2868 -0.0845 1.2361 10.2290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.05976 0.25404 118.3 <2e-16 ***
## av_weight 5.68820 0.03765 151.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.967 on 2621 degrees of freedom
## Multiple R-squared: 0.897, Adjusted R-squared: 0.897
## F-statistic: 2.283e+04 on 1 and 2621 DF, p-value: < 2.2e-16
# calculate prediction intervals for your model
pred_ints <- predict(length_model, interval = "prediction", level = 0.95)
## Warning in predict.lm(length_model, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
# calculate confidence intervals as well
conf_ints <- predict(length_model, interval = "confidence", level = 0.95)
# plot model with prediction and confidence intervals, along with model fit
plot(df_mod$av_weight, df_mod$av_len, main = "Plot of Average Length vs. Average weight")
abline(length_model, col = "red", lwd = 3)
lines(df_mod$av_weight, pred_ints[,2], col = "skyblue", lty = 2)
lines(df_mod$av_weight, pred_ints[,3], col = "skyblue", lty = 2)
lines(df_mod$av_weight, conf_ints[,2], col = "blue", lty = 3)
lines(df_mod$av_weight, conf_ints[,3], col = "blue", lty = 3)
library(car)
library(MASS)
library(corrplot)
# lets look at the correlations between variables
corrplot(cor(df_mod[c("av_len", "av_weight", "hfias_score", "av_muac")], method = "pearson"))
mv_length_model <- lm(av_len ~ av_weight + gender + edu + hfias_score + av_muac, data = df_mod)
summary(mv_length_model)
##
## Call:
## lm(formula = av_len ~ av_weight + gender + edu + hfias_score +
## av_muac, data = df_mod)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3677 -1.2086 -0.0165 1.1371 9.8643
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.095246 1.648829 33.415 < 2e-16 ***
## av_weight 5.897488 0.038395 153.602 < 2e-16 ***
## gender -0.321244 0.074935 -4.287 1.88e-05 ***
## edu 0.006073 0.005161 1.177 0.239
## hfias_score 0.006921 0.005239 1.321 0.187
## av_muac -2.203009 0.143019 -15.404 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.878 on 2617 degrees of freedom
## Multiple R-squared: 0.9063, Adjusted R-squared: 0.9061
## F-statistic: 5062 on 5 and 2617 DF, p-value: < 2.2e-16
## to check linear model assumptions
vif(mv_length_model) # variance inflation factor
## av_weight gender edu hfias_score av_muac
## 1.141297 1.020360 1.005315 1.008059 1.117906
plot(mv_length_model) # plots residuals vs fitted, normal q-q, cook's distance, and std residuals
crPlots(mv_length_model, terms = ~av_weight + hfias_score + av_muac, smooth = T)
# shows the smoothed relationship of each variable