Chapter 4 Data cleaning
4.1 What did we do?
If you don’t want to look at all the lengthy work, Kavya and Zuofu are excited to talk through the process with you!
library(readxl)
library(dplyr)
library(ggplot2)
library(rjags)
library(forcats)
library(gridExtra)
library(shiny)
library(data.table)
library(knitr)
4.1.1 Load data, select and rename columns
If you want to reproduce our work, make sure to change your file path :)
myData1617 <- read.csv("/Users/kevinhuang/Desktop/Past Academics/Spring 2019/MATH 454/454 Project/CollegeScorecard_Raw_Data/MERGED2016_17_PP.csv")
myData1617_sub <- myData1617 %>%
select(INSTNM,REGION,LOCALE,UGDS,ADM_RATE,ACTCMMID,SAT_AVG,UGDS_WHITE,COSTT4_A)
myData1516 <- read.csv(file = "/Users/kevinhuang/Desktop/Past Academics/Spring 2019/MATH 454/454 Project/CollegeScorecard_Raw_Data/MERGED2015_16_PP.csv")
myData1516_sub <- myData1516 %>%
select(INSTNM,UGDS,ADM_RATE,ACTCMMID,SAT_AVG,UGDS_WHITE,COSTT4_A)
myData1415 <- read.csv(file = "/Users/kevinhuang/Desktop/Past Academics/Spring 2019/MATH 454/454 Project/CollegeScorecard_Raw_Data/MERGED2014_15_PP.csv")
myData1415_sub <- myData1415 %>%
select(INSTNM,UGDS,ADM_RATE,ACTCMMID,SAT_AVG,UGDS_WHITE,COSTT4_A)
4.1.2 Filter schools that have a rank
college <- rep(NA,149)
for (i in 1:149){
college[i] = newLiberalArts[i,]$CollegeName
}
university <- rep(NA,120)
for (i in 1:120){
university[i] = newUniversity[i,]$UniversityName
}
schools <- c(college, university)
# Run tests to make sure that datasets are consistent:
check_1_Data1617 <- myData1617_sub %>%
filter(INSTNM %in% college)
dim(check_1_Data1617)
check_3_Data1415 <- myData1415_sub %>%
filter(INSTNM %in% college)
dim(check_3_Data1415)
check_2_Data1516 <- myData1516_sub %>%
filter(INSTNM %in% college)
dim(check_2_Data1516)
# Identify class of each variable, in preparation for transformations.
class(myData1617_sub$UGDS_1617)
class(myData1617_sub$ADM_RATE_1617)
class(myData1617_sub$ACTCMMID_1617)
class(myData1617_sub$SAT_AVG_1617)
class(myData1617_sub$UGDS_WHITE_1617)
class(myData1617_sub$COSTT4_A_1617)
class(myData1415_sub$INSTNM)
myData1415_sub_characterCollege <- myData1415_sub %>%
filter(INSTNM %in% college) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1415 = as.numeric(as.character(UGDS_1415))) %>%
mutate(UGDS_WHITE_1415 = as.numeric(as.character(UGDS_WHITE_1415))) %>%
mutate(SAT_AVG_1415 = as.numeric(as.character(SAT_AVG_1415))) %>%
mutate(ACTCMMID_1415 = as.numeric(as.character(ACTCMMID_1415))) %>%
mutate(ADM_RATE_1415 = as.numeric(as.character(ADM_RATE_1415))) %>%
mutate(COSTT4_A_1415 = as.numeric(as.character(COSTT4_A_1415)))
myData1516_sub_characterCollege <- myData1516_sub %>%
filter(INSTNM %in% college) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1516 = as.numeric(as.character(UGDS_1516))) %>%
mutate(UGDS_WHITE_1516 = as.numeric(as.character(UGDS_WHITE_1516))) %>%
mutate(SAT_AVG_1516 = as.numeric(as.character(SAT_AVG_1516))) %>%
mutate(ACTCMMID_1516 = as.numeric(as.character(ACTCMMID_1516))) %>%
mutate(ADM_RATE_1516 = as.numeric(as.character(ADM_RATE_1516))) %>%
mutate(COSTT4_A_1516 = as.numeric(as.character(COSTT4_A_1516)))
myData1617_sub_characterCollege <- myData1617_sub %>%
filter(INSTNM %in% college) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1617 = as.numeric(as.character(UGDS_1617))) %>%
mutate(UGDS_WHITE_1617 = as.numeric(as.character(UGDS_WHITE_1617))) %>%
mutate(SAT_AVG_1617 = as.numeric(as.character(SAT_AVG_1617))) %>%
mutate(ACTCMMID_1617 = as.numeric(as.character(ACTCMMID_1617))) %>%
mutate(ADM_RATE_1617 = as.numeric(as.character(ADM_RATE_1617))) %>%
mutate(COSTT4_A_1617 = as.numeric(as.character(COSTT4_A_1617)))
4.1.3 Join datasets
fullLiberalArts <- newLiberalArts %>%
full_join(myData1415_sub_characterCollege, by = c("CollegeName" = "INSTNM")) %>%
full_join(myData1516_sub_characterCollege, by = c("CollegeName" = "INSTNM")) %>%
full_join(myData1617_sub_characterCollege, by = c("CollegeName" = "INSTNM"))
dim(fullLiberalArts)
## [1] 175 30
full_LiberalArts <- fullLiberalArts[!duplicated(fullLiberalArts$CollegeName), ]
dim(full_LiberalArts) # After we removed the repetitive entries.
## [1] 149 30
myData1415_sub_characterUniversity <- myData1415_sub %>%
filter(INSTNM %in% university) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1415 = as.numeric(as.character(UGDS_1415))) %>%
mutate(UGDS_WHITE_1415 = as.numeric(as.character(UGDS_WHITE_1415))) %>%
mutate(SAT_AVG_1415 = as.numeric(as.character(SAT_AVG_1415))) %>%
mutate(ACTCMMID_1415 = as.numeric(as.character(ACTCMMID_1415))) %>%
mutate(ADM_RATE_1415 = as.numeric(as.character(ADM_RATE_1415))) %>%
mutate(COSTT4_A_1415 = as.numeric(as.character(COSTT4_A_1415)))
myData1516_sub_characterUniversity <- myData1516_sub %>%
filter(INSTNM %in% university) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1516 = as.numeric(as.character(UGDS_1516))) %>%
mutate(UGDS_WHITE_1516 = as.numeric(as.character(UGDS_WHITE_1516))) %>%
mutate(SAT_AVG_1516 = as.numeric(as.character(SAT_AVG_1516))) %>%
mutate(ACTCMMID_1516 = as.numeric(as.character(ACTCMMID_1516))) %>%
mutate(ADM_RATE_1516 = as.numeric(as.character(ADM_RATE_1516))) %>%
mutate(COSTT4_A_1516 = as.numeric(as.character(COSTT4_A_1516)))
myData1617_sub_characterUniversity <- myData1617_sub %>%
filter(INSTNM %in% university) %>%
mutate(INSTNM = as.character(INSTNM)) %>%
mutate(UGDS_1617 = as.numeric(as.character(UGDS_1617))) %>%
mutate(UGDS_WHITE_1617 = as.numeric(as.character(UGDS_WHITE_1617))) %>%
mutate(SAT_AVG_1617 = as.numeric(as.character(SAT_AVG_1617))) %>%
mutate(ACTCMMID_1617 = as.numeric(as.character(ACTCMMID_1617))) %>%
mutate(ADM_RATE_1617 = as.numeric(as.character(ADM_RATE_1617))) %>%
mutate(COSTT4_A_1617 = as.numeric(as.character(COSTT4_A_1617)))
fullUniversity <- newUniversity %>%
full_join(myData1415_sub_characterUniversity, by = c("UniversityName" = "INSTNM")) %>%
full_join(myData1516_sub_characterUniversity, by = c("UniversityName" = "INSTNM")) %>%
full_join(myData1617_sub_characterUniversity, by = c("UniversityName" = "INSTNM"))
dim(fullUniversity)
## [1] 120 30