#install.packages("tidyverse")
#install.packages("lubridate")
library(tidyverse)
library(lubridate)Part 3: Working with large-scale data
Intro to Computational Studies in Education and the Social Sciences
Case Study: HBCUs and Educational Geography
In this section, we analyze data on Historically Black Colleges and Universities (HBCUs) to explore patterns in location, institutional type, and founding history.
Step 1: Load Packages
We’ll start with a guided example using data on Historically Black Colleges and Universities.
Step 2: Load the data
library(tidyverse)
hbcu <- read_csv("https://raw.githubusercontent.com/quant-shop/intro-comp-educ-soc/refs/heads/main/data/hbcu_data.csv")
# View the first few rows
head(hbcu)# A tibble: 6 × 7
name city state founded lat lon type
<chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
1 Alabama A&M University Normal AL 1875 34.8 -86.6 Public, 4 Year
2 Alabama State University Montgomery AL 1867 32.4 -86.3 Public, 4 Year
3 Albany State University Albany GA 1903 31.6 -84.2 Public, 4 Year
4 Alcorn State University Lorman MS 1871 31.9 -91.1 Public, 4 Year
5 Allen University Columbia SC 1870 34.0 -81.0 Private, 4 Year
6 American Baptist College Nashville TN 1924 36.2 -86.8 Private, 4 Year
# Check structure
glimpse(hbcu)Rows: 102
Columns: 7
$ name <chr> "Alabama A&M University", "Alabama State University", "Albany …
$ city <chr> "Normal", "Montgomery", "Albany", "Lorman", "Columbia", "Nashv…
$ state <chr> "AL", "AL", "GA", "MS", "SC", "TN", "AR", "SC", "NC", "FL", "A…
$ founded <dbl> 1875, 1867, 1903, 1871, 1870, 1924, 1884, 1870, 1873, 1904, 19…
$ lat <dbl> 34.7834, 32.3643, 31.5785, 31.8769, 34.0298, 36.1659, 34.7465,…
$ lon <dbl> -86.5683, -86.2952, -84.1543, -91.1458, -81.0115, -86.7844, -9…
$ type <chr> "Public, 4 Year", "Public, 4 Year", "Public, 4 Year", "Public,…
# Summary statistics
summary(hbcu) name city state founded
Length:102 Length:102 Length:102 Min. :1837
Class :character Class :character Class :character 1st Qu.:1870
Mode :character Mode :character Mode :character Median :1886
Mean :1895
3rd Qu.:1905
Max. :1988
lat lon type
Min. :18.34 Min. :-98.50 Length:102
1st Qu.:32.48 1st Qu.:-90.13 Class :character
Median :34.02 Median :-84.64 Mode :character
Mean :34.31 Mean :-85.13
3rd Qu.:36.17 3rd Qu.:-80.78
Max. :39.93 Max. :-64.96
Step 3: Clean the data
hbcu <- hbcu %>%
mutate(
founded = as.numeric(founded),
type = as.factor(type),
state = as.factor(state)
)Step 4: Basic Exploration
How many institutions are in the data set?
nrow(hbcu)[1] 102
What is the distribution by HBCU type?
hbcu %>%
count(type) %>%
arrange(desc(n))# A tibble: 5 × 2
type n
<fct> <int>
1 Private, 4 Year 45
2 Public, 4 Year 40
3 Public, 2 Year 11
4 Private, Specialized 4
5 Private, 2 Year 2
Step 5: Historical Analysis
When were the HBCUs founded?
hbcu %>%
ggplot(aes(x = founded)) +
geom_histogram(binwidth = 10, fill = "steelblue", color = "white") +
labs(
title = "Founding Years of HBCUs",
x = "Year Founded",
y = "Number of Institutions"
)
Step 6: Geographic Distribution
HBCUs by state
hbcu %>%
count(state, sort = TRUE)# A tibble: 21 × 2
state n
<fct> <int>
1 AL 14
2 GA 10
3 NC 10
4 TX 9
5 SC 8
6 MS 7
7 LA 6
8 TN 6
9 AR 4
10 FL 4
# ℹ 11 more rows
Basic visualization of HBCUs by state
hbcu %>%
count(state) %>%
ggplot(aes(x = reorder(state, n), y = n)) +
geom_col(fill = "darkgreen") +
coord_flip() +
labs(
title = "Number of HBCUs by State",
x = "State",
y = "Count"
)
Step 7: Mapping
#install.packages("maps", repos = "http://cran.us.r-project.org")
library(maps)hbcu %>%
ggplot(aes(x = lon, y = lat)) +
borders("state") +
geom_point(color = "red", alpha = 0.6) +
coord_fixed(1.3) +
labs(
title = "Geographic Distribution of HBCUs in the U.S.",
x = "Longitude",
y = "Latitude"
)
Case Study: Data on State School Districts
Step 1: Load the libraries
library(dplyr)
library(readr)
library(stringr)
library(janitor)Step 2: Load the data
district <- read.csv("https://raw.githubusercontent.com/quant-shop/intro-comp-educ-soc/refs/heads/main/data/state-district-data.csv")
# district <- read.csv("../data/state-district-data.csv")
str(district)'data.frame': 57 obs. of 36 variables:
$ State.Name : chr "ALABAMA" "ALASKA" "AMERICAN SAMOA" "ARIZONA" ...
$ State.Abbr : chr "AL " "AK " "AS " "AZ " ...
$ ANSI.FIPS.State.Code..State..Latest.available.year : int 1 2 60 4 5 59 6 8 9 10 ...
$ Total.Number.Operational.School.Districts..District..2023.24 : int 156 54 1 685 304 174 2108 272 208 45 ...
$ Total.Number.Operational.Schools..Public.School..2023.24 : int 1524 494 29 2427 1101 174 10326 1908 1010 234 ...
$ Total.Number.Operational.Charter.Schools..Public.School..2023.24 : chr "18" "31" "†" "571" ...
$ Prekindergarten.Students..State..2023.24 : chr "21924" "3320" "–" "20810" ...
$ Kindergarten.Students..State..2023.24 : chr "55881" "9257" "–" "72320" ...
$ Grades.1.8.Students..State..2023.24 : chr "447317" "78890" "–" "654843" ...
$ Grades.9.12.Students..State..2023.24 : chr "223528" "39776" "–" "369124" ...
$ Grade.13.Students..State..2023.24 : chr "†" "†" "†" "†" ...
$ Ungraded.Students..State..2023.24 : chr "†" "†" "†" "533" ...
$ American.Indian.Alaska.Native.Students..State..2023.24 : chr "5997" "28284" "–" "46878" ...
$ Asian.or.Asian.Pacific.Islander.Students..State..2023.24 : chr "11016" "6458" "–" "35975" ...
$ Hispanic.Students..State..2023.24 : chr "84196" "10076" "–" "539087" ...
$ Black.or.African.American.Students..State..2023.24 : chr "236304" "2955" "–" "64938" ...
$ White.Students..State..2023.24 : chr "381550" "61502" "–" "378320" ...
$ Nat..Hawaiian.or.Other.Pacific.Isl..Students..State..2023.24 : chr "831" "4148" "–" "4017" ...
$ Two.or.More.Races.Students..State..2023.24 : chr "28756" "17820" "–" "48415" ...
$ Total.Enrollment..Exclude.AE..for.SY.2014.15.onward..State..2023.24: chr "748650" "131243" "–" "1117630" ...
$ Full.Time.Equivalent..FTE..Teachers..State..2023.24 : chr "42857.92" "7221.23" "–" "49260.47" ...
$ Pupil.Teacher.Ratio..State..2023.24 : chr "17.47" "18.17" "–" "22.69" ...
$ State.Name..State..2023.24 : chr "ALABAMA" "ALASKA" "AMERICAN SAMOA" "ARIZONA" ...
$ Total.Number.of.Districts.with.Enrollment..District..2023.24 : int 150 54 0 664 260 174 1984 185 201 42 ...
$ Total.Number.of.School.Districts..District..2023.24 : int 159 54 1 722 308 174 2113 272 210 45 ...
$ Total.Number.of.Public.Schools..Public.School..2023.24 : int 1527 498 29 2557 1106 174 10349 1908 1018 235 ...
$ Male.Students..State..2023.24 : chr "384500" "67646" "†" "569863" ...
$ Female.Students..State..2023.24 : chr "364150" "63597" "†" "547767" ...
$ Black.or.African.American...male..State..2023.24 : chr "120605" "1488" "–" "33191" ...
$ Black.or.African.American...female..State..2023.24 : chr "115699" "1467" "–" "31747" ...
$ White...male..State..2023.24 : chr "197384" "31786" "–" "194292" ...
$ White...female..State..2023.24 : chr "184166" "29716" "–" "184028" ...
$ Grade.12.Students...Black.or.African.American..State..2023.24 : chr "15413" "241" "–" "5876" ...
$ Secondary.Teachers..State..2023.24 : chr "20188.71" "3673.47" "–" "15996.72" ...
$ Elementary.Teachers..State..2023.24 : chr "18395.46" "2985.09" "–" "30478.09" ...
$ Full.Time.Equivalent..FTE..Staff..State..2023.24 : chr "79963.90" "16423.20" "–" "109199.43" ...
Step 3: Clean the data
district_clean <- district %>%
# Fix missing/suppressed values
mutate(across(where(is.character), ~na_if(., "–"))) %>%
mutate(across(where(is.character), ~na_if(., "†"))) %>%
mutate(across(where(is.character), str_trim)) %>%
# Convert numeric columns safely
mutate(across(
where(is.character),
~parse_number(.)
))Step 4: Feature engineering
We can create variables that answer educational questions.
str(district_clean)'data.frame': 57 obs. of 36 variables:
$ State.Name : num NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "problems")= tibble [57 × 4] (S3: tbl_df/tbl/data.frame)
.. ..$ row : int [1:57] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ col : int [1:57] NA NA NA NA NA NA NA NA NA NA ...
.. ..$ expected: chr [1:57] "a number" "a number" "a number" "a number" ...
.. ..$ actual : chr [1:57] "ALABAMA" "ALASKA" "AMERICAN SAMOA" "ARIZONA" ...
$ State.Abbr : num NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "problems")= tibble [57 × 4] (S3: tbl_df/tbl/data.frame)
.. ..$ row : int [1:57] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ col : int [1:57] NA NA NA NA NA NA NA NA NA NA ...
.. ..$ expected: chr [1:57] "a number" "a number" "a number" "a number" ...
.. ..$ actual : chr [1:57] "AL" "AK" "AS" "AZ" ...
$ ANSI.FIPS.State.Code..State..Latest.available.year : int 1 2 60 4 5 59 6 8 9 10 ...
$ Total.Number.Operational.School.Districts..District..2023.24 : int 156 54 1 685 304 174 2108 272 208 45 ...
$ Total.Number.Operational.Schools..Public.School..2023.24 : int 1524 494 29 2427 1101 174 10326 1908 1010 234 ...
$ Total.Number.Operational.Charter.Schools..Public.School..2023.24 : num 18 31 NA 571 105 NA 1280 261 21 23 ...
$ Prekindergarten.Students..State..2023.24 : num 21924 3320 NA 20810 10102 ...
$ Kindergarten.Students..State..2023.24 : num 55881 9257 NA 72320 34776 ...
$ Grades.1.8.Students..State..2023.24 : num 447317 78890 NA 654843 290034 ...
$ Grades.9.12.Students..State..2023.24 : num 223528 39776 NA 369124 149892 ...
$ Grade.13.Students..State..2023.24 : num NA NA NA NA NA NA NA NA NA NA ...
$ Ungraded.Students..State..2023.24 : num NA NA NA 533 174 NA NA NA NA 0 ...
$ American.Indian.Alaska.Native.Students..State..2023.24 : num 5997 28284 NA 46878 2789 ...
$ Asian.or.Asian.Pacific.Islander.Students..State..2023.24 : num 11016 6458 NA 35975 8858 ...
$ Hispanic.Students..State..2023.24 : num 84196 10076 NA 539087 70888 ...
$ Black.or.African.American.Students..State..2023.24 : num 236304 2955 NA 64938 94324 ...
$ White.Students..State..2023.24 : num 381550 61502 NA 378320 280483 ...
$ Nat..Hawaiian.or.Other.Pacific.Isl..Students..State..2023.24 : num 831 4148 NA 4017 4960 ...
$ Two.or.More.Races.Students..State..2023.24 : num 28756 17820 NA 48415 22666 ...
$ Total.Enrollment..Exclude.AE..for.SY.2014.15.onward..State..2023.24: num 748650 131243 NA 1117630 484978 ...
$ Full.Time.Equivalent..FTE..Teachers..State..2023.24 : num 42858 7221 NA 49260 38755 ...
$ Pupil.Teacher.Ratio..State..2023.24 : num 17.5 18.2 NA 22.7 12.5 ...
$ State.Name..State..2023.24 : num NA NA NA NA NA NA NA NA NA NA ...
..- attr(*, "problems")= tibble [57 × 4] (S3: tbl_df/tbl/data.frame)
.. ..$ row : int [1:57] 1 2 3 4 5 6 7 8 9 10 ...
.. ..$ col : int [1:57] NA NA NA NA NA NA NA NA NA NA ...
.. ..$ expected: chr [1:57] "a number" "a number" "a number" "a number" ...
.. ..$ actual : chr [1:57] "ALABAMA" "ALASKA" "AMERICAN SAMOA" "ARIZONA" ...
$ Total.Number.of.Districts.with.Enrollment..District..2023.24 : int 150 54 0 664 260 174 1984 185 201 42 ...
$ Total.Number.of.School.Districts..District..2023.24 : int 159 54 1 722 308 174 2113 272 210 45 ...
$ Total.Number.of.Public.Schools..Public.School..2023.24 : int 1527 498 29 2557 1106 174 10349 1908 1018 235 ...
$ Male.Students..State..2023.24 : num 384500 67646 NA 569863 248311 ...
$ Female.Students..State..2023.24 : num 364150 63597 NA 547767 236657 ...
$ Black.or.African.American...male..State..2023.24 : num 120605 1488 NA 33191 47530 ...
$ Black.or.African.American...female..State..2023.24 : num 115699 1467 NA 31747 46794 ...
$ White...male..State..2023.24 : num 197384 31786 NA 194292 144628 ...
$ White...female..State..2023.24 : num 184166 29716 NA 184028 135855 ...
$ Grade.12.Students...Black.or.African.American..State..2023.24 : num 15413 241 NA 5876 5828 ...
$ Secondary.Teachers..State..2023.24 : num 20189 3673 NA 15997 16388 ...
$ Elementary.Teachers..State..2023.24 : num 18395 2985 NA 30478 17007 ...
$ Full.Time.Equivalent..FTE..Staff..State..2023.24 : num 79964 16423 NA 109199 80417 ...
..- attr(*, "problems")= tibble [1 × 4] (S3: tbl_df/tbl/data.frame)
.. ..$ row : int 51
.. ..$ col : int NA
.. ..$ expected: chr "a number"
.. ..$ actual : chr "‡"
# what kinds of questions might you answer with this data set?