This post is a summary of what I’ve learnt about R commands in the Applied Linear Modeling course this semester.
Special thanks to Dr. Jenine Harris who taught this course, and TA Anne Trolard who taught the lab! Taking this course has been a great experience due to their efforts and talent. Also, statistics is fun & R is fun & applying statistics to public health issues is fun- so we don’t get bored.
This post will be a convenient reference for myself (and hopefully for you who are reading this post) working with R in the future. With this purpose, I am not going to write about statistical principles but simply a summary of commands.
Note: When you copy the commands in this post to R, the quotation marks “” wouldn’t be the right font sometimes – if that occurs, just delete and retype the quotation marks “”.
Table of Contents
1.Importing data 2.Create dataset in R 3.Missing values 4.Recode variables 5.Subset data 6.Descriptive statistics 7.Graphing 8.Correlation 9.Simple linear regression 10.Multiple regression 11.Visualization of regression line 12.Check assumptions for multiple linear regression 13.Outliers and influential observations 14.Logistic Regression 15.ANOVA
1. Importing data
#set a working directory
setwd(“C:/Users/path”)
install.packages(‘foreign’)
library(foreign)
data <- read.spss(“C:/Users/path/filename.sav”, to.data.frame = TRUE)
data <- read.csv(file.choose(), header=T)
data <- read.delim(‘E:/path/filename.dat’, header=T)
View(data)
summary(data)
head(data) # give you the first six rows
summary(data$variable)
#use contrasts to see how R codes categorical variables
contrasts(data$variable)
sort(data$variable)
save(data, file=”dataname.Rda”) # we can save it as a R datafile
rm(data)
load(“dataname.Rda”) #import the R datafile
2.Create dataset in R
# give values to vectors
continous_variable1 <- c(1,2,3,4,5)
categorical_variable2 <- c(“0″,”0″,”0″,”1″,”1”)
#add vectors to a new data frame called d
d <-data.frame(continous_variable1,categorical_variable2)
3.Missing values
is.na(data$variable) <- data$variable == “998”
data$variable[data$variable==99] <- NA
#deleting missing values (the observation)
data <-data[!is.na(data$variable),]
rownames(data) <- NULL
#run summary again to check: there’s no missing values
summary(titanic$Embarked)
#recoding a missing value code to NA
smoke$packyears[smoke$packyears==252]<- NA
#converting NA to something else
smoke$packyears[is.na(smoke$packyears)] <- “Never Smoked”
4.Recode variables
#collapse employment to fewer groups
smokers$newemp[smokers$employment == “1-Employed for wages”] <- “Employed for wages”
smokers$newemp[smokers$employment == “2-Self-employed”] <- “Self employed”
smokers$newemp[smokers$employment == “3-Out of work for more than 1 year”] <- “Unemployed”
smokers$newemp[smokers$employment == “4-Out of work for less than 1 year”] <- “Unemployed”
smokers$newemp[smokers$employment == “5-A homemaker”] <- “Other”
smokers$newemp[smokers$employment == “6-A student”] <- “Other”
smokers$newemp[smokers$employment == “7-Retired”] <- “Other”
smokers$newemp[smokers$employment == “8-Unable to work”] <- “Other”
smokers$newemp[smokers$employment == “9-Refused”] <- NA
#collapsing a categorical variable (see lab11)
summary(smoke$marital_status)
new.levels <- c(1,0,0,0,0,0,NA,NA)
smoke$marital_status3 <- factor(new.levels[smoke$marital_status])
summary(smoke$marital_status3)
#create categorical variable from continuous variable
data$variable_cat <- cut(data$variable, c(minvalue,cutoffpoint,maxvalue), labels=c(“label1″ ,”label2”))
#How do we check that it was categorized correctly?
table(data$variable, data$variable_cat )
#examples from lab6
#collapsing a continuous variable into a 3-categorical variable
naccho$tfollowerscat <- cut(naccho$tfollowers, seq(0,10,3), right=FALSE)
summary(naccho$tfollowerscat)
table(naccho$tfollowers, naccho$tfollowerscat)
#seq(0,10,3) cut the values between 0-10 into three categories
#right=FALSE is [), TRUE is (] – can demonstrate this using the homenetper var
naccho$homenetper
summary(naccho$homenetper)
table(naccho$homenetper)
naccho$homenet2 <- cut(naccho$homenetper, right=FALSE, c(65.2,73,76,84.7))
naccho$homenet3 <- cut(naccho$homenetper, right=TRUE, c(65.2,73,76,84.7))
#FALSE: [65.2,73) [73,76) [76,84.7)
#TRUE: (65.2,73] (73,76] (76,84.7]
#encode a vector as a factor, to create categories
#example from week 6 syntax naccho75$c5q51a <- factor(naccho75$c5q51a, levels=c(0,1), labels=c("No specialist", "specialist"))
#from help document: factor(x = character(), levels, labels = levels, exclude = NA, ordered = is.ordered(x), nmax = NA)
5.Subset data
#example from wk7 demo. create a smaller data set including just the variables of interest
#get the means and standard deviations for everything in the smaller data set
#put means and sds into a data.frame to organize
c2<-na.omit(crimestats[c(“crime”,”murder”,”pctmetro”,”pcths”,”poverty”,”single”)])
#”na.omit”: omit all the observations with missing values for the six variables
means<-sapply(c2, mean, na.rm=TRUE) # get a new vector with all the means
sds<-sapply(c2, sd, na.rm=TRUE)
data.frame(means,sds)
# unique id for DC is 51
# create a new dataset excluding DC
c3<-subset(crimestats, sid != 51)
#More info:
http://statmethods.net/management/subset.html
6.Descriptive statistics
mean(data$variable, na.rm=TRUE)
sd(data$variable, na.rm=TRUE)
#mean & sd by group:
m <- tapply(data$variable, data$group, na.rm=TRUE, mean)
m
s <- tapply(data$variable, data$group,na.rm=TRUE, sd)
s
#creating a table
tab1 <- aggregate(dv ~ iv, data, mean)
tab2 <- aggregate(cbind(v1, v2) ~ group , data, mean)
View(tab1)
View(tab2)
#chi-squared
ch <- chisq.test(data$variable1, data$variable2)
ch
#observed, expected, standardized residuals
ch$observed
ch$expected
ch$stdres
#another way to do chi-squared (see week 9 reviewcommands)
library(descr)
#Get table & chi-square
CrossTable(data$variable1,data$variable2,chisq=T)
#get expected values
CrossTable(data$variable1,data$variable2,expected=T)
#delete cell contents: Chi-square contribution,row %,column %,table %
CrossTable(data$variable1,data$variable2,chisq=T,
prop.chisq=F,prop.c=F,prop.t=F)
#one-sample t-test
t.test(smokers$ageonset, mu=18)
#independent samples t-test
t.test(smokers$ageonset~smokers$sex)
#paired samples t-test
t.test(smokers$numcigs,smokers$packyears,paired=TRUE)
#one-way ANOVA
fit <- aov(numcigs ~ employment, data=smokers)
summary(fit)
#post hoc test
TukeyHSD(fit)
#means of each group
aggregate(numcigs ~ employment,data=smokers,mean)
7.Graphing
#histogram
hist(data$variable)
#boxplot
boxplot(data$variable, xlab=”IV”,ylab=”DV”,
main=”Figure 1. Title.”,
boxwex=.3, col=”gold”)
#bar graphs need a summary table of numbers (name the table “a” here)
a<-table(data$variable)
print(a)
barplot(a)
#scatterplot for simple linear regression
plot(data$IV,data$DV)
plot(data$IV,data$DV, col=”darkred”,
main=”Figure 1. Title.”,
xlab=”…”,
ylab=”…”)
abline(lsfit(data$IV,data$DV))
#3D plot for multiple linear regression
install.packages(“scatterplot3d”)
library(scatterplot3d)
s3d <- scatterplot3d(data$IV1, data$IV2, data$DV,
highlight.3d=TRUE, main=”Figure 1. title.”)
fit <- lm(data$DV ~ data$IV1+data$IV2)
s3d$plane3d(fit, lty.box=”solid”)
#scatterplot matrix
pairs(data)
#identify outlier in scatterplot
plot(data$IV,data$DV)
text(data$IV,data$DV, labels=data$variable)
#from lab7: visualize the missing data
install.packages(“Amelia”)
library(Amelia)
titanic <- read.csv(file.choose(), header=T, na.strings=c(“”))
#na.strings ensures that missing values in string variables (or character vectors in R-speak) come in as “NA”
#If you want to change 888 into missing, you can type c(“888”)
#a cool table that visualizes your missing data–need the Amelia package for this
missmap(data, main = “Missing values vs observed”)
#visualize the logistic regression interaction: see week 8 – syntax 2
8.Correlation
cor.test(data$variable1,data$variable2)
#test whether the correlations are different for groups
install.packages(‘psych’)
library(psych)
#enter the correlation for males (.29)
#the correlation for females (.74)
#and the sample sizes for male (4) and female (24)
paired.r(.29,.74,NULL, 4, 24)
#enter the correlation for height & arm span (.79)
#the correlation for height & hand width (.53)
#the correlation for arm span & hand width (.63)
#and the sample size (28)
paired.r(.79,.53,.63, 28)
#partial correlation between two variables controlling for a third variable
install.packages(“ggm”)
library(ggm)
#get the partial correlation between
#height and hand, controlling for arm span
#assign it to an object called “hh”
hh <- pcor(c(“height”, “hand”, “armspan”), var(students))
hh
#get the amount of variation explained by
#squaring the correlation
hh^2
#test to see if hh is significantly different from 0
#use the pcor.test command with the correlation hh
#the number of variables (3) and the number of people (28)
pcor.test(hh,3,28)
9.Simple linear regression
install.packages(“car”)
install.packages(“QuantPsyc”)
library(car)
library(QuantPsyc)
#after making a scatterplot, create a simple linear regression predicting
#BMI from days of physical activity
bmimodel1<-lm(bmi~days, data=bmidata)
summary(bmimodel1)
#get some confidence intervals for the slope and intercept
confint(bmimodel1) #defaults to 95%
confint(bmimodel1, level=.99) #get the 99% CI
10.Multiple regression
#after running scatterplot3d, create a multiple regression model
model1<-lm(DV ~ IV1+ IV2, data)
summary(model1)
#start by giving the info specialist categories labels, and create a larger model
naccho75$c5q51a <- factor(naccho75$c5q51a, levels=c(0,1), labels=c(“No specialist”, “specialist”))
followers2<-lm(tfollowers ~ popmil + tweets + c5q51a, naccho75)
summary(followers2)
#compare two models
model1<-lm(tfollowers ~ popmil + tweets, naccho75)
summary(model1)
model2<-lm(tfollowers ~ popmil + tweets + homenet, naccho75)
summary(model2)
anova(model1, model2)
11.Visualization of regression line
*Note to self: the syntax is from my challenge 1 assignment.
install.packages(‘visreg’)
library(visreg)
#graph the relationship between popmil & tfollowers with 3 levels of twitadopt
par(mar=c(5,5,5,2))
visreg(model3, “popmil”, by=”twitadopt”, overlay=TRUE,band=FALSE,legend=FALSE,
#deleted this line: line=list(col=c(“green”,”orange”,”red”)),
points.par=list(cex=0.8),
ylab=”Twitter followers (in hundreds)”, xlab=”Population (in millions)”,
main=”Figure 3. Twitter followers by\npopulation in 75 health departments.”)
#use contrasts to see how R codes categorical variables, so I can add legend
contrasts(NACCHO$twitadopt)
legend(“bottomright”, levels(NACCHO$twitadopt),
#change this line: col=c(“green”,”orange”,”red”),
col=c(“red”,”green”,”blue”),
legend=c(“early”,”mid”,”late”), pch=15)
12.Check assumptions for multiple linear regression
syntax: see week7
#Checking multicollinearity assumption
#checking multicollinearity assumption
#Durbin-Watson test
#Linearity test
# Normality of Residuals
#Evaluate homoscedasticity
13. Outliers and influential observations
syntax: see week7
#Step 1:
#make a new data set without missing values
#Step 2:
#leverage
#cook’s distance
#studentized deleted residuals
#standardized residuals
#covariance ratio
14. Logistic Regression
library(car)
library(mlogit)
library(gmodels)
#check coding on the “use” outcome variable
table(condom$use) # it’s binary
#you’ll get the output below:
#Condom Used Unprotected
#43 57
#re-order categories on the “use” outcome variable
condom$use <- factor(condom$use, levels=c(“Unprotected”,”Condom Used”))
#check reorder on the “use” outcome variable
table(condom$use)
#get output:
#Unprotected Condom Used
#57 43
#So now your model is predicting the probability of comdom use
#logistic regression model
model1<-glm(DV ~ IV1+IV2+IV3+…+IVn, data=dataname, family=binomial())
summary(model1)
#odds ratios & confidence interval for model1
exp(cbind(OR = coef(model1), confint(model1)))
#computing model chi-squared
modelChi <- model1$null.deviance – model1$deviance
chidf <- model1$df.null – model1$df.residual
chisq.prob <- 1 – pchisq(modelChi, chidf)
modelChi; chidf; chisq.prob
#classification table for percent correctly predicted
#if p>.5, it counts as “condom use”;
#if p<.5, it counts as “no condom use”.
#needs the descr library for the CrossTable command
install.packages(‘descr’)
library(descr)
p <- predict(condommodel.1, newdata=condom, type=”response”)
condom$pred <- factor(1*(p >= .5), labels=c(“No”, “Yes”))
CrossTable(condom$use,condom$pred, expected=F,
prop.chisq=F, sresid=F, prop.r=T,
prop.c=F, prop.t=F)
#compare model 1 and model 2
modelLRchisq <- model1$deviance – model2$deviance
LRdf <- model1$df.residual – model2$df.residual
chisq.prob <- 1 – pchisq(modelLRchisq, LRdf)
modelLRchisq; LRdf; chisq.prob
#interaction
condommodel.3<-glm(use ~ perceive + gender + perceive*gender, data=condom,
family=binomial)
summary(condommodel.3)
#For the commands regarding the assumptions for logistic regression, see week8 syntax 3
#Test 3 assumptions in logistic
#(1)Test linearity
#(2)Test Independence of Errors
#(3)Test multicollinearity
#Identify outliers: the same with linear regression.
#significance for model
with(model1, null.deviance – deviance)
with(model1, df.null – df.residual)
with(model1, pchisq(null.deviance – deviance, df.null – df.residual, lower.tail = FALSE))
#compare big model to small model use LR test
c <- modelsmall$deviance – modelbig$deviance
d <- modelsmall$df.residual – modelbig$df.residual
pchisq(c,d, lower.tail=F)
#percent correctly predicted
#more sensitive (1s) or specific (0s)
p <- predict(modelbig, newdata=dataname, type=”response”)
dataname$pred <- factor(1*(p >= .5))
CrossTable(dataname$DV, mydata$pred, expected=F,
prop.chisq = F, sresid=F, prop.r=T,
prop.c=F, prop.t=F)
15.ANOVA
#See week 10 syntax
#one-way ANOVA for mean BMI by gravidity
library(stats)
bmigrav <- aov(prolapse$BMI ~ prolapse$gravidity_cat)
summary(bmigrav) # reject the null hypothesis, means are not equal
print(bmigrav)
#Check assumptions
#post-hoc
#two-way ANOVA