Set up libraries and working directory

knitr::opts_chunk$set(echo = TRUE, warnings = FALSE, message = FALSE)

setwd("C:/Users/keess/Box/ASA Share/ASA 2020/Resources/Tufts-ASA_R_Stats_Modules/TNDS 2021 workshop")

library(tidyverse)
library(readxl)
library(haven)
library(plotrix)
library(expss)
library(moments)

Read in data

# load("Input_Data/workshop_data.RData")
# write_csv(x = workshop_data, "Input_Data/workshop_data.csv")

## read in a csv file
df <- read_csv("Input_Data/workshop_data.csv")

Describe, view, and edit data once imported

names(df) # variable names in df
##  [1] "study_id"    "study_arm"   "agedays"     "agemonths"   "gender"     
##  [6] "cgage"       "edu"         "av_weight"   "av_len"      "av_muac"    
## [11] "_zwei"       "_zwfl"       "hfias_score" "hfias_cat"
nrow(df) # number of rows in df
## [1] 2653
ncol(df) # number of columns in df
## [1] 14
str(df) # structure
## tibble [2,653 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ study_id   : num [1:2653] 1 2 3 4 5 6 7 8 9 10 ...
##  $ study_arm  : num [1:2653] 1 1 1 1 1 1 1 1 1 1 ...
##  $ agedays    : num [1:2653] 399 477 1506 429 527 ...
##  $ agemonths  : num [1:2653] 13.1 15.7 49.5 14.1 17.3 ...
##  $ gender     : num [1:2653] 1 1 1 1 0 0 0 1 0 0 ...
##  $ cgage      : num [1:2653] 21 25 46 29 20 25 23 20 35 25 ...
##  $ edu        : num [1:2653] 0 3 0 0 0 0 3 3 0 1 ...
##  $ av_weight  : num [1:2653] 7.14 7.2 12.79 6.76 6.93 ...
##  $ av_len     : num [1:2653] 70.6 70.6 103.3 68.2 72.3 ...
##  $ av_muac    : num [1:2653] 11.8 11.9 11.9 12 12 ...
##  $ _zwei      : num [1:2653] -2.95 -3.33 -2.06 -3.59 -3.19 -2.39 -2.91 -2.6 -2.89 -2.55 ...
##  $ _zwfl      : num [1:2653] -2.26 -2.16 -2.89 -2.14 -2.53 -1.55 -1.96 -1.45 -2.15 -1.63 ...
##  $ hfias_score: num [1:2653] 16 21 15 0 10 12 15 16 11 26 ...
##  $ hfias_cat  : num [1:2653] 4 4 4 1 4 3 4 4 3 4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   study_id = col_double(),
##   ..   study_arm = col_double(),
##   ..   agedays = col_double(),
##   ..   agemonths = col_double(),
##   ..   gender = col_double(),
##   ..   cgage = col_double(),
##   ..   edu = col_double(),
##   ..   av_weight = col_double(),
##   ..   av_len = col_double(),
##   ..   av_muac = col_double(),
##   ..   `_zwei` = col_double(),
##   ..   `_zwfl` = col_double(),
##   ..   hfias_score = col_double(),
##   ..   hfias_cat = col_double()
##   .. )
## find all unique values of gender
unique(df$gender)
## [1]   1   0 -99
# find the number of unique values of age in days
n_distinct(df$agedays)[1:10]
##  [1] 690  NA  NA  NA  NA  NA  NA  NA  NA  NA
# the number of rows with complete observations
sum(complete.cases(df))
## [1] 2623
# the number of missing observations with agedays
sum(is.na(df$agedays))
## [1] 13
## lapply can take a function (like unique), and apply it to all columns of a dataframe (df)
unique_vals <- lapply(df, unique)
distinct_vals <- lapply(df, n_distinct)
## missing vals for each column:
missing <- lapply(df, function(x) sum(is.na(x)))

Generate and label variables and observations

# base R:
df$ageyears = df$agedays/365.25

# same code in tidyverse
# df_w_age <- df %>%
#   mutate(ageyears = agedays/365.25)


## make labels from package "expss"
df_w_labels <- df %>%
  apply_labels(ageyears = "age in years",
               agemonths = "age in months",
               agedays = "age in days",
               edu = "education")


## labelling observations not really done in R (I didn't think they were done anywhere)
## but if you want to do it, use the rownames() function
## you can also include a label variable that corresponds to being a unique identifier for each
## row
# ?rownames()
# ?colnames()

Get summary statistics

# simple base R summary function
summary(df)
##     study_id      study_arm        agedays         agemonths     
##  Min.   :   1   Min.   :1.000   Min.   : 171.0   Min.   : 5.622  
##  1st Qu.:2595   1st Qu.:2.000   1st Qu.: 231.0   1st Qu.: 7.595  
##  Median :5115   Median :3.000   Median : 326.0   Median :10.718  
##  Mean   :4430   Mean   :2.611   Mean   : 403.1   Mean   :13.254  
##  3rd Qu.:7624   3rd Qu.:4.000   3rd Qu.: 493.2   3rd Qu.:16.216  
##  Max.   :8500   Max.   :4.000   Max.   :1809.0   Max.   :59.474  
##                                 NA's   :13       NA's   :13      
##      gender             cgage             edu             av_weight     
##  Min.   :-99.0000   Min.   :-99.00   Min.   :-99.0000   Min.   : 4.085  
##  1st Qu.:  0.0000   1st Qu.: 21.00   1st Qu.:  0.0000   1st Qu.: 5.955  
##  Median :  0.0000   Median : 26.00   Median :  0.0000   Median : 6.510  
##  Mean   :  0.3129   Mean   : 26.59   Mean   :  0.4124   Mean   : 6.677  
##  3rd Qu.:  1.0000   3rd Qu.: 32.00   3rd Qu.:  1.0000   3rd Qu.: 7.220  
##  Max.   :  1.0000   Max.   : 80.00   Max.   :  5.0000   Max.   :15.010  
##                                                         NA's   :1       
##      av_len          av_muac          _zwei            _zwfl       
##  Min.   : 53.15   Min.   :11.50   Min.   :-6.140   Min.   :-4.840  
##  1st Qu.: 63.55   1st Qu.:11.73   1st Qu.:-3.390   1st Qu.:-2.270  
##  Median : 66.85   Median :11.97   Median :-2.870   Median :-1.790  
##  Mean   : 68.03   Mean   :11.97   Mean   :-2.882   Mean   :-1.799  
##  3rd Qu.: 71.45   3rd Qu.:12.20   3rd Qu.:-2.330   3rd Qu.:-1.320  
##  Max.   :108.20   Max.   :12.47   Max.   : 1.260   Max.   : 1.370  
##  NA's   :3                        NA's   :17       NA's   :8       
##   hfias_score       hfias_cat        ageyears     
##  Min.   : 0.000   Min.   :1.000   Min.   :0.4682  
##  1st Qu.: 1.000   1st Qu.:1.000   1st Qu.:0.6324  
##  Median :10.000   Median :4.000   Median :0.8925  
##  Mean   : 9.451   Mean   :3.083   Mean   :1.1037  
##  3rd Qu.:14.000   3rd Qu.:4.000   3rd Qu.:1.3504  
##  Max.   :27.000   Max.   :4.000   Max.   :4.9528  
##  NA's   :9        NA's   :9       NA's   :13
# individual summary functions
mean(df$agemonths, na.rm = T)
## [1] 13.254
median(df$agemonths, na.rm = T)
## [1] 10.7178
sd(df$agemonths, na.rm = T)
## [1] 7.900913
# no explicit function for the mode of a vector, but we can make one
getmode <- function(x){
  uniquev <- unique(x)
  uniquev[which.max(tabulate(match(x, uniquev)))]
}
hist(df$agemonths, breaks = 100, xlim = c(min(df$agemonths, na.rm = T), max(df$agemonths, na.rm = T)))

getmode(df$agemonths)
## [1] 6.082185
## REMEMBER: getmode is not an actual R function, it is made up for the purposes of displaying R's function syntax

range(df$agemonths, na.rm = T)
## [1]  5.621912 59.473907
min(df$agemonths, na.rm = T)
## [1] 5.621912
max(df$agemonths, na.rm = T)
## [1] 59.47391
## skewness and kurtosis are from the "moments" package
skewness(df$agemonths, na.rm = T)
## [1] 2.149153
kurtosis(df$agemonths, na.rm = T)
## [1] 9.280115
quantile(df$agemonths, probs = c(.10, .25, .50, .75, .90), na.rm = T)
##       10%       25%       50%       75%       90% 
##  6.542459  7.594512 10.717796 16.216421 22.921619
# ?quantile()

Frequency table with row/column percentages

mytable <- table(df$gender, df$edu)

# tidyverse function
count(df, gender, sort = T)
## # A tibble: 3 x 2
##   gender     n
##    <dbl> <int>
## 1      0  1523
## 2      1  1127
## 3    -99     3
rownames(mytable) = c("missing", "male", "female")
colnames(mytable) = c("missing", "none", "primary school", "secondary school",
                      "some high school", "some college", "graduate school")

# summarize table across rows
margin.table(mytable, 1)
## 
## missing    male  female 
##       3    1523    1127
# summarize table across columns
margin.table(mytable, 2)
## 
##          missing             none   primary school secondary school 
##               14             1419              574               59 
## some high school     some college  graduate school 
##              563               21                3
# calculate row percentages 
round(prop.table(mytable, 1), 3)
##          
##           missing  none primary school secondary school some high school
##   missing   0.000 0.667          0.000            0.000            0.333
##   male      0.005 0.538          0.221            0.020            0.207
##   female    0.005 0.530          0.211            0.025            0.219
##          
##           some college graduate school
##   missing        0.000           0.000
##   male           0.007           0.001
##   female         0.009           0.001
# calculate column percentages
round(prop.table(mytable, 2), 3)
##          
##           missing  none primary school secondary school some high school
##   missing   0.000 0.001          0.000            0.000            0.002
##   male      0.571 0.578          0.585            0.525            0.560
##   female    0.429 0.421          0.415            0.475            0.439
##          
##           some college graduate school
##   missing        0.000           0.000
##   male           0.524           0.667
##   female         0.476           0.333

Convert continuous to categorical variables with if/then statements

## tidyverse version. ifelse is a vectorized if/else statement, meaning it applies
## if/else logic across an entire column of a dataframe. R's native if/else logic can only
## be applied to one data point at a time, and therefore ifelse is often more efficient when
## working with data frames
df_cat <- df %>%
  mutate(hfias_cat = ifelse(hfias_score < 1, 4,
                            ifelse(hfias_score >= 1 & hfias_score < 10, 3,
                                   ifelse(hfias_score >= 10 & hfias_score < 14, 2,
                                          ifelse(hfias_score >= 14, 1, NA)))),
         hfias_cat = factor(hfias_cat))


## base r version. We are indexing all of the columns where our conditions are satisfied, and 
## assigning them a category based on the condition
df$hfias_cat[df$hfias_cat < 1] <- 4
df$hfias_cat[between(df$hfias_cat, 1, 10)] <- 3
df$hfias_cat[between(df$hfias_cat, 10, 14)] <- 2
df$hfias_cat[df$hfias_cat >= 14] <- 1

## save your output as a permanent RData file with the save function
save(df_cat, file = "Output Data/output_df.RData")

## or save as a csv
write.csv(df_cat, file = "Output Data/output_df.csv")

## csv files are more memory intensive than RData files, so if you are working with large data sets,
## use RData if you are exclusively working in R

Producing a histogram to inspect variables

n_distinct(df$hfias_score)
## [1] 29
## use breaks to specify the number of bins you want in your histogram.
hist(df$hfias_score, breaks = 29, main = "Histogram of HFIAS Score",
     freq = TRUE, col = "blue")

## Comparison tests (t-test and Wilcoxon)

df_ttest <- df[df$gender == 0 | df$gender == 1,]

# simple unpaired t test
t.test(df_ttest$av_len ~ df_ttest$gender, conf.level = 0.95, paired = FALSE, var.equal = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  df_ttest$av_len by df_ttest$gender
## t = -5.4411, df = 2337.4, p-value = 5.848e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.7902305 -0.8416832
## sample estimates:
## mean in group 0 mean in group 1 
##        67.46464        68.78060
# simple Wilcoxon rank sum
wilcox.test(df_ttest$av_len ~ df_ttest$gender, conf.int = T, conf.level = 0.95, exact = F,
            correct = T)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  df_ttest$av_len by df_ttest$gender
## W = 736851, p-value = 8.454e-10
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
##  -1.7500611 -0.9000049
## sample estimates:
## difference in location 
##              -1.300048

Measures of association (pearson vs. spearman)

mean(df$av_len, na.rm = T)
## [1] 68.02908
p_corr <- cor(df$av_weight, df$av_len, use = "complete.obs", method = c("pearson"))
s_corr <- cor(df$av_weight, df$av_len, use = "complete.obs", method = c("spearman"))
p_corr
## [1] 0.9449736
s_corr
## [1] 0.9309364

Simple Linear Regression

## using complete observations for the sake of simplicity
df_mod <- df[complete.cases(df),]

# model of average length compared to weight
length_model <- lm(av_len ~ av_weight, data = df_mod)
summary(length_model) # get description of model
## 
## Call:
## lm(formula = av_len ~ av_weight, data = df_mod)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9180 -1.2868 -0.0845  1.2361 10.2290 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.05976    0.25404   118.3   <2e-16 ***
## av_weight    5.68820    0.03765   151.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.967 on 2621 degrees of freedom
## Multiple R-squared:  0.897,  Adjusted R-squared:  0.897 
## F-statistic: 2.283e+04 on 1 and 2621 DF,  p-value: < 2.2e-16
# calculate prediction intervals for your model
pred_ints <- predict(length_model, interval = "prediction", level = 0.95)
## Warning in predict.lm(length_model, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
# calculate confidence intervals as well
conf_ints <- predict(length_model, interval = "confidence", level = 0.95)

# plot model with prediction and confidence intervals, along with model fit
plot(df_mod$av_weight, df_mod$av_len, main = "Plot of Average Length vs. Average weight")
abline(length_model, col = "red", lwd = 3)
lines(df_mod$av_weight, pred_ints[,2], col = "skyblue", lty = 2)
lines(df_mod$av_weight, pred_ints[,3], col = "skyblue", lty = 2)
lines(df_mod$av_weight, conf_ints[,2], col = "blue", lty = 3)
lines(df_mod$av_weight, conf_ints[,3], col = "blue", lty = 3)

Multivariable Linear Regression

library(car)
library(MASS)
library(corrplot)

# lets look at the correlations between variables
corrplot(cor(df_mod[c("av_len", "av_weight", "hfias_score", "av_muac")], method = "pearson"))

mv_length_model <- lm(av_len ~ av_weight + gender + edu + hfias_score + av_muac, data = df_mod)
summary(mv_length_model)
## 
## Call:
## lm(formula = av_len ~ av_weight + gender + edu + hfias_score + 
##     av_muac, data = df_mod)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3677 -1.2086 -0.0165  1.1371  9.8643 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 55.095246   1.648829  33.415  < 2e-16 ***
## av_weight    5.897488   0.038395 153.602  < 2e-16 ***
## gender      -0.321244   0.074935  -4.287 1.88e-05 ***
## edu          0.006073   0.005161   1.177    0.239    
## hfias_score  0.006921   0.005239   1.321    0.187    
## av_muac     -2.203009   0.143019 -15.404  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.878 on 2617 degrees of freedom
## Multiple R-squared:  0.9063, Adjusted R-squared:  0.9061 
## F-statistic:  5062 on 5 and 2617 DF,  p-value: < 2.2e-16
## to check linear model assumptions
vif(mv_length_model) # variance inflation factor
##   av_weight      gender         edu hfias_score     av_muac 
##    1.141297    1.020360    1.005315    1.008059    1.117906
plot(mv_length_model) # plots residuals vs fitted, normal q-q, cook's distance, and std residuals

crPlots(mv_length_model, terms = ~av_weight + hfias_score + av_muac, smooth = T)

# shows the smoothed relationship of each variable