My R manual (work in progress)
https://drive.google.com/file/d/1CBN0usv0-UBNexJGhqXSrnoSw9bL2F4Z/view?usp=sharing
Excel reading
library(readxl)
main <- read_excel("public high schools CCD.xlsx",sheet="PSM")
arc <- read_excel("public high schools CCD.xlsx",sheet="ARC")
String manipulation
https://www.r-bloggers.com/basic-text-string-functions-in-r/
#get a county name from column agency in main
main$pos = regexpr('COUNTY', main$agency)
main$county = substr(main$agency,1,main$pos-1)
Keep when a variable's value is X
GrowthMindset<-filter(ff,var_name2=="% Vali" & var_name3=="")
Grep (Keep rows when a variable includes a certain string)
census <- read.csv(file = 'DP02.csv')
WV<- census[grep("West Virginia", census$GEONAME), ]
Histogram
hist(treat$grandtotal,breaks=100)
Paste
abc="this"
def="pen"
paste(abc, "is a", def)
Missing values
ref_2018c %>%
mutate(value_missing=case_when(
AuditDate_68 >= -999 ~ 0,
is.na(AuditDate_68) ~ 1,
)) ->ref_2018c2
Reading from a CSV file and get a freq on a string variable.
setwd("C:/Users/….")
temp <- read.csv(file="filenamehere.csv",header=TRUE,sep=",")
temp2<-as.data.frame(table(temp$Exit.Reason))
Function
kaz_macro1<-function(var1){
var1 %>%
mutate(GrowthMindset = (q0008_0001+q0008_0002+q0008_0003+q0008_0004+q0008_0005+q0008_0006+q0008_0007+q0008_0008)/8) %>%
mutate(SelfEfficacy = (q0009_0001+q0009_0002+q0009_0003+q0009_0004+q0009_0005)/5) %>%
mutate(MSelfEfficacy = (q0010_0001+q0010_0002+q0010_0003+q0010_0004+q0010_0005+q0010_0006+q0010_0007)/7) %>%
mutate(MathAnxiety = (q0011_0001+q0011_0002+q0011_0003+q0011_0004+q0011_0005+q0011_0006)/6) %>%
mutate(TeacherUse = (q0012_0001+q0012_0002+q0012_0003+q0012_0004+q0013_0001+q0013_0002+q0013_0003+q0013_0004)/8) -> var1
subset(var1,select=c(flag,dataID,commonID,treat,GrowthMindset,SelfEfficacy,MSelfEfficacy,MathAnxiety,TeacherUse))
}
wholedata2 <- kaz_macro1(wholedata)
data_Male2 <- kaz_macro1(data_Male)
In SAS
%let var1=gender;
In R
var1<-gender
Function
this <-function(x){
x*24
}
this(5)
Works like IF statements
wholedata %>%
mutate(gender=case_when(
q0014==1 ~"Male",
q0014==2 ~"Female"
)) ->wholedata_a
wholedata %>%
mutate(gender=case_when(
q0014==1 ~"Male",
q0014==2 ~"Female")) %>%
mutate(race=case_when(
q0015==1 ~"White",
q0015==2 ~"non White"
)) ->wholedata_a
#all variable names will become lowercases
var.names<-tolower(colnames(data_name))
colnames(data_name)<-var.names
Simple recoding:
Fixed date
https://www.stat.berkeley.edu/~s133/dates.html
wholedata$year1start <- as.Date('2012-8-1')
Calendar date variable in R
https://www.r-bloggers.com/date-formats-in-r/
Example. This is for a date variable coming from an Excel file (which sets the origin date to 1899-12-30).
wholedata$ReferralDate_1001_SAS <- as.Date(wholedata$ReferralDate_1001,origin = "1899-12-30")
R instruction by matloff
https://github.com/matloff/fasteR
Function (Thanks, Aisaku)
mydata = cars
colnames(mydata)
proc_means = function(var){
x = list(summary= summary(var),
sd = sd(var),
N = length(var),
number_of_NA = sum(is.na(var)))
return(x)
}
proc_means(mydata$speed,)
proc_means(mydata$dist)
describe function (from psych package)
https://personality-project.org/r/html/describe.html
Getting a mean when the values include missing values
mean(temp2$mem1,na.rm=TRUE)
Recoding using SQLDF
temp2<-sqldf("
SELECT team_mem_1,
CASE WHEN team_mem_1='No' then 0 WHEN team_mem_1='' then NULL ELSE 1 END AS mem1,
CASE WHEN team_mem_2='No' then 0 WHEN team_mem_2='' then NULL ELSE 1 END AS mem2,
CASE WHEN team_mem_3='No' then 0 WHEN team_mem_3='' then NULL ELSE 1 END AS mem3,
CASE WHEN team_mem_4='No' then 0 WHEN team_mem_4='' then NULL ELSE 1 END AS mem4,
CASE WHEN team_mem_5='No' then 0 WHEN team_mem_5='' then NULL ELSE 1 END AS mem5,
CASE WHEN team_mem_6='No' then 0 WHEN team_mem_6='' then NULL ELSE 1 END AS mem6,
CASE WHEN team_mem_7='No' then 0 WHEN team_mem_7='' then NULL ELSE 1 END AS mem7,
CASE WHEN team_mem_8='No' then 0 WHEN team_mem_8='' then NULL ELSE 1 END AS mem8,
CASE WHEN team_mem_9='No' then 0 WHEN team_mem_9='' then NULL ELSE 1 END AS mem9,
CASE WHEN team_mem_10='No' then 0 WHEN team_mem_10='' then NULL ELSE 1 END AS mem10,
CASE WHEN team_mem_11='No' then 0 WHEN team_mem_11='' then NULL ELSE 1 END AS mem11,
FROM temp1;
")
SQL
sqldf("SELECT DIV_NAME, SUM(DIV_NUM) as X FROM t1
GROUP BY DIV_NAME
ORDER BY X DESC
LIMIT 3;
" )
Saving data
temp2<-sqldf("SELECT DIV_NAME, SUM(DIV_NUM) as X FROM t1
GROUP BY DIV_NAME
ORDER BY X DESC
LIMIT 3;
" )
Indexing
deck[1:10,]
Lists
list1<-list(100:130, "R",list(TRUE,FALSE))
list1
Functions
roll <-function(){
die <-1:6
dice <-sample(die,size=2,replace=TRUE)
sum(dice)
dice
}
roll()
roll <-function(die){
dice <-sample(die,size=2,replace=TRUE)
sum(dice)
dice
}
roll(die <-1:6)
library(ggplot2)
roll <-function(){
die <-1:6
dice <-sample(die,size=2,replace=TRUE,
prob=c(1/8, 1/8, 1/8, 1/8, 1/8, 3/8))
sum(dice)
}
rolls<-replicate(100000,roll())
qplot(rolls,binwidth=1)
#sorting data
attach(t1)
t1 <- t1[order(SECTION_COURSES_ID),]
If statement
if (x == 1){print("x is 1")} else if {print("x is not 1!")}
RECODING
From a text to a text
t1 <- t1 %>% mutate(course_type = case_when(
LOCAL_COURSE_TITLE == '22 Algebra I Hon, YR' ~ 'ALG 1',
LOCAL_COURSE_TITLE == '22 Math 07, YR' ~ 'Pre ALG',
))
From a text to a numeric
t1 <- t1 %>% mutate(treat = case_when(
LICENSE_NUM == 'AA1' ~ 1,
LICENSE_NUM == 'AA5' ~ 1
))
SQL in R
library(sqldf)
sqldf('SELECT data_from, dropout,FINALGPA, FINALGPA+2 FROM parkrose1 WHERE dropout = 1 ORDER BY FINALGPA ASC')
# Add new character variable agecat to abaloneMod
abaloneMod <- abaloneMod %>%
mutate(agecat = ifelse(test = age < 10.5,
yes = "< 10.5",
no = "10.5 and older"))
#create two samples off the data
set.seed(567)
index_random <-sample(1:nrow(coh2both),2/3*nrow(coh2both))
training_set<-coh2both[index_random,]
test_set<-coh2both[-index_random,]
#create a categorical variable
coh2both$gpa_cat<-rep(NA,length(coh2both$GPA_))
coh2both$gpa_cat[which(coh2both$GPA_ <=2)]<-"less than 2"
coh2both$gpa_cat[which(coh2both$GPA_ > 2)]<-"gt than 1"
Replace a missing value with a median
median_gpa=median(coh2both$GPA_,na.rm=TRUE)
temp1<-coh2both
temp1$GPA_[na_index]<-median_gpa
summary(temp1$GPA_)
mean(temp1$GPA_)
Remove a variable
x$treat <-NULL
Subsetting data
select_treat<-which(coh2both2$treat == 1)
comparison <- coh2both2[-select_treat, ]
treat <- coh2both2[+select_treat, ]
na_index<-which(is.na(coh2both2$GPA_))
result<-coh2both2[-na_index,]
Zscore
zscore<--scale(coh2both2$post_reach15)
mean(zscore)
var(zscore)
Chi-square test
table(coh2both2$district,coh2both2$treat)
#create a change score
coh2both %>%
mutate(growth=post_reach15-pre_reach15 ) ->coh2both2
plot(coh2both2$growth,coh2both2$GPA_,xlab="change",ylab="gpa")summary(coh2both2.list)
mean(coh2both2$pre_reach15 <2 )
table(coh2both2$district)
table(coh2both2$district,coh2both2$treat)
outlier detection
index_out<-which(coh2both$GPA_ > 1)
x<-coh2both[-index_out,]
Bivariate Plot
plot(coh2both$pre_reach15,coh2both$post_reach15)
Create a variable
#create a change score
coh2both %>%
mutate(growth=post_reach15-pre_reach15 ) ->coh2both2
String manipulation
library(stringr)
str_trim(" this is a test")
Calendar variables
# Preview students2 with str()
str(students2)
# Load the lubridate package
library(lubridate)
# Parse as date
dmy("17 Sep 2015")
# Parse as date and time (with no seconds!)
mdy_hm("July 15, 2012 12:56")
# Coerce dob to a date (with no time)
students2$dob <- ymd(students2$dob)
Change the variable type
https://campus.datacamp.com/courses/cleaning-data-in-r/1828?ex=3
# Preview students with str()
str(students)
# Coerce Grades to character
students$Grades <- as.character(students$Grades)
# Coerce Medu to factor
students$Medu <- as.factor(students$Medu)
# Coerce Fedu to factor
students$Fedu <- as.factor(students$Fedu)
Dates with lubridate
https://campus.datacamp.com/courses/cleaning-data-in-r/1828?ex=1
library(lubridate)
The use of na.rm=TRUE
mean(x,na.rm=TRUE)
This above is an example of how telling R not to be bothered by a missing value. Without the na.rm specificaiton, the function will not return a value.
Combining two variables into one.
psmdata$cohort_t_status<-paste0(psmdata$cohortid,"-",psmdata$treat)
by_cohort <- psmdata %>%
group_by(cohort_t_status) %>%
summarize(meanACT=mean(act_composite))
ggplot(by_cohort,aes(x=cohort_t_status,y=meanACT))+geom_col()
Attach() and detach()
http://www.statmethods.net/management/aggregate.html
How to convert SAS and other data files into R files:
http://www.ats.ucla.edu/stat/r/faq/inputdata_R.htm
SQL in R
sqldf
Function example:
addition = function(num1,num2){ answer = num1+num2 return(answer) } addition(10,9) addition(5,4)
Change working directly (Notice the slash is / not \ even on Windows)
setwd("C:/R")
You can check the current working directly by submitting:
getwd()
Read this with more details by clicking here:
Convert a CSV file into a readable dataset
temp <- read.csv(file="practice_data.csv",header=TRUE,sep=",")
type the name of the dataset to see what you just did (if big, you will see a lot of data, though not all):
temp
This would do the same thing:
temp = read.csv(file="practice_data.csv",header=TRUE,sep=",")
You can check what datasets you have activated by:
objects()
Print a variable off a dataset
temp2 <- temp$height
Quick look at the data
str(temp) <This descrives the structure of data.>
head(temp) <This prints the first 6 observations.>
Get the descriptive summary of the data or variables
summary(temp)
summary(temp$weight)
mean(BOD$demand) <BOD is a default dataset.>
hist(BOD$demand) <Histogram is created.>
hist(BOD$demand,breaks=4) <The bar breask every 4 units>
boxplot(BOD$demand)
Create new variables
newvar=BOD$demand*2
newvar
Misllaneous comands
library() You can tell which packages you downloaded
search() This does something similar.
data() Show available default datasets
if you type the name of the data, you can see it. For example:
BOD
Reference
https://sites.google.com/site/webtextofr/ (Japanese)
Algebra and R http://atcm.mathandtech.org/EP2008/papers_full/2412008_14997.pdf