chore: scrape enrollment and staffing data

This commit is contained in:
Nelson Jovel 2024-02-06 14:51:00 -08:00
parent 725348bf95
commit 1810ee0074
7 changed files with 15026 additions and 13173 deletions

View file

@ -1,23 +1,24 @@
require 'watir' require "watir"
require 'csv' require "csv"
module Dashboard
module Dese module Dese
module Enrollments module Enrollments
include Dese::Scraper include Dashboard::Dese::Scraper
attr_reader :filepaths attr_reader :filepaths
def scrape_enrollments(filepath:) def scrape_enrollments(filepath:)
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID', headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'SP', 'Total'] "PK", "K", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "SP", "Total"]
write_headers(filepath:, headers:) write_headers(filepath:, headers:)
run do |academic_year| run do |academic_year|
admin_data_item_id = '' admin_data_item_id = ""
url = 'https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx' url = "https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range } "ctl00_ContentPlaceHolder1_ddYear" => range }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = ->(_headers, _items) { 'NA' } calculation = ->(_headers, _items) { "NA" }
Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation) Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
end end
end end
@ -26,9 +27,9 @@ module Dese
@students ||= {} @students ||= {}
if @students.count == 0 if @students.count == 0
CSV.parse(File.read(filepath), headers: true).map do |row| CSV.parse(File.read(filepath), headers: true).map do |row|
academic_year = row['Academic Year'] academic_year = row["Academic Year"]
school_id = row['DESE ID'].to_i school_id = row["DESE ID"].to_i
total = row['Total'].gsub(',', '').to_i total = row["Total"].gsub(",", "").to_i
@students[[school_id, academic_year]] = total @students[[school_id, academic_year]] = total
end end
end end
@ -36,3 +37,4 @@ module Dese
end end
end end
end end
end

View file

@ -1,8 +1,9 @@
module Dashboard
module Dese module Dese
module Scraper module Scraper
DELAY = 20 DELAY = 20
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id, Prerequisites = Struct.new("Prerequisites", :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation) :calculation)
def run def run
academic_years = AcademicYear.all.order(range: :DESC) academic_years = AcademicYear.all.order(range: :DESC)
@ -38,26 +39,26 @@ module Dese
end end
def write_headers(filepath:, headers:) def write_headers(filepath:, headers:)
CSV.open(filepath, 'w') do |csv| CSV.open(filepath, "w") do |csv|
csv << headers csv << headers
end end
end end
def write_csv(document:, filepath:, range:, id:, calculation:) def write_csv(document:, filepath:, range:, id:, calculation:)
table = document.css('tr') table = document.css("tr")
headers = document.css('.sorting') headers = document.css(".sorting")
header_hash = headers.each_with_index.map { |header, index| [header.text, index] }.to_h header_hash = headers.each_with_index.map { |header, index| [header.text, index] }.to_h
CSV.open(filepath, 'a') do |csv| CSV.open(filepath, "a") do |csv|
table.each do |row| table.each do |row|
items = row.css('td').map(&:text) items = row.css("td").map(&:text)
dese_id = items[1].to_i dese_id = items[1].to_i
next if dese_id.nil? || dese_id.zero? next if dese_id.nil? || dese_id.zero?
raw_likert_score = calculation.call(header_hash, items) raw_likert_score = calculation.call(header_hash, items)
raw_likert_score ||= 'NA' raw_likert_score ||= "NA"
likert_score = raw_likert_score likert_score = raw_likert_score
if likert_score != 'NA' if likert_score != "NA"
likert_score = 5 if likert_score > 5 likert_score = 5 if likert_score > 5
likert_score = 1 if likert_score < 1 likert_score = 1 if likert_score < 1
likert_score = likert_score.round(2) likert_score = likert_score.round(2)
@ -76,3 +77,4 @@ module Dese
end end
end end
end end
end

View file

@ -1,11 +1,12 @@
require 'watir' require "watir"
module Dashboard
module Dese module Dese
class Staffing class Staffing
include Dese::Scraper include Dashboard::Dese::Scraper
attr_reader :filepath attr_reader :filepath
def initialize(filepath: Rails.root.join('data', 'staffing', 'staffing.csv')) def initialize(filepath: Dashboard::Engine.root.join("data", "dashboard", "staffing", "staffing.csv"))
@filepath = filepath @filepath = filepath
end end
@ -14,22 +15,23 @@ module Dese
end end
def scrape_staffing(filepath:) def scrape_staffing(filepath:)
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year",
'School Name', 'DESE ID', "School Name", "DESE ID",
'PK-2 (#)', '3-5 (#)', '6-8 (#)', '9-12 (#)', 'Multiple Grades (#)', "PK-2 (#)", "3-5 (#)", "6-8 (#)", "9-12 (#)", "Multiple Grades (#)",
'All Grades (#)', 'FTE Count'] "All Grades (#)", "FTE Count"]
write_headers(filepath:, headers:) write_headers(filepath:, headers:)
run do |academic_year| run do |academic_year|
admin_data_item_id = 'NA' admin_data_item_id = "NA"
url = 'https://profiles.doe.mass.edu/statereport/gradesubjectstaffing.aspx' url = "https://profiles.doe.mass.edu/statereport/gradesubjectstaffing.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range, "ctl00_ContentPlaceHolder1_ddYear" => range,
'ctl00_ContentPlaceHolder1_ddDisplay' => 'Full-time Equivalents' } "ctl00_ContentPlaceHolder1_ddDisplay" => "Full-time Equivalents" }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = ->(_headers, _items) { 'NA' } calculation = ->(_headers, _items) { "NA" }
Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation) Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
end end
end end
end end
end end
end

View file

@ -1,15 +1,19 @@
require 'watir' require "watir"
require 'csv' require "csv"
module Dashboard
module Dese module Dese
class ThreeATwo class ThreeATwo
include Dese::Scraper include Dashboard::Dese::Scraper
include Dese::Enrollments include Dashboard::Dese::Enrollments
attr_reader :filepaths attr_reader :filepaths
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'enrollments.csv'), def initialize(filepaths:
Rails.root.join('data', 'admin_data', 'dese', '3A_2_age_staffing.csv'), [Dashboard::Engine.root.join("data", "dashboard", "admin_data", "dese", "enrollments.csv"),
Rails.root.join('data', 'admin_data', 'dese', '3A_2_grade_subject_staffing.csv')]) Dashboard::Engine.root.join("data", "dashboard", "admin_data", "dese",
"3A_2_age_staffing.csv"),
Dashboard::Engine.root.join("data", "dashboard", "admin_data", "dese",
"3A_2_grade_subject_staffing.csv")])
@filepaths = filepaths @filepaths = filepaths
end end
@ -32,43 +36,43 @@ module Dese
end end
def write_a_sust_i1_headers(filepath:) def write_a_sust_i1_headers(filepath:)
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID', headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
'<26 yrs (# )', '26-32 yrs (#)', '33-40 yrs (#)', '41-48 yrs (#)', "<26 yrs (# )", "26-32 yrs (#)", "33-40 yrs (#)", "41-48 yrs (#)",
'49-56 yrs (#)', '57-64 yrs (#)', 'Over 64 yrs (#)', 'FTE Count', "49-56 yrs (#)", "57-64 yrs (#)", "Over 64 yrs (#)", "FTE Count",
'Student Count', 'Student to Guidance Counselor ratio'] "Student Count", "Student to Guidance Counselor ratio"]
write_headers(filepath:, headers:) write_headers(filepath:, headers:)
end end
def write_a_sust_i4_headers(filepath:) def write_a_sust_i4_headers(filepath:)
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID', headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
'PK-2 (# )', '3-5 (# )', '6-8 (# )', '9-12 (# )', 'Multiple Grades (# )', 'All Grades (# )', 'FTE Count', "PK-2 (# )", "3-5 (# )", "6-8 (# )", "9-12 (# )", "Multiple Grades (# )", "All Grades (# )", "FTE Count",
'Student Count', 'Student to Art Teacher ratio'] "Student Count", "Student to Art Teacher ratio"]
write_headers(filepath:, headers:) write_headers(filepath:, headers:)
end end
def run_a_sust_i1(filepath:) def run_a_sust_i1(filepath:)
run do |academic_year| run do |academic_year|
admin_data_item_id = 'a-sust-i1' admin_data_item_id = "a-sust-i1"
url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx' url = "https://profiles.doe.mass.edu/statereport/agestaffing.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range, "ctl00_ContentPlaceHolder1_ddYear" => range,
'ctl00_ContentPlaceHolder1_ddJobClassification' => 'Guidance Counselor' } "ctl00_ContentPlaceHolder1_ddJobClassification" => "Guidance Counselor" }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = lambda { |headers, items| calculation = lambda { |headers, items|
fte_index = headers['FTE Count'] fte_index = headers["FTE Count"]
num_of_guidance_counselors = items[fte_index].to_f num_of_guidance_counselors = items[fte_index].to_f
dese_id = items[headers['School Code']].to_i dese_id = items[headers["School Code"]].to_i
school = School.find_by_dese_id(dese_id) school = School.find_by_dese_id(dese_id)
return 'NA' unless school.present? && school.is_hs? return "NA" unless school.present? && school.is_hs?
num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0 num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0
items << num_of_students items << num_of_students
benchmark = 250 benchmark = 250
if fte_index.present? && !items[fte_index] != '' if fte_index.present? && !items[fte_index] != ""
result = ((benchmark - (num_of_students / num_of_guidance_counselors)) + benchmark) * 4 / benchmark result = ((benchmark - (num_of_students / num_of_guidance_counselors)) + benchmark) * 4 / benchmark
end end
items << (num_of_students / num_of_guidance_counselors) items << (num_of_students / num_of_guidance_counselors)
@ -80,21 +84,21 @@ module Dese
def run_a_sust_i2(filepath:) def run_a_sust_i2(filepath:)
run do |academic_year| run do |academic_year|
admin_data_item_id = 'a-sust-i2' admin_data_item_id = "a-sust-i2"
url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx' url = "https://profiles.doe.mass.edu/statereport/agestaffing.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range, "ctl00_ContentPlaceHolder1_ddYear" => range,
'ctl00_ContentPlaceHolder1_ddJobClassification' => 'School Psychologist -- Non-Special Education' } "ctl00_ContentPlaceHolder1_ddJobClassification" => "School Psychologist -- Non-Special Education" }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = lambda { |headers, items| calculation = lambda { |headers, items|
fte_index = headers['FTE Count'] fte_index = headers["FTE Count"]
num_of_psychologists = items[fte_index].to_f num_of_psychologists = items[fte_index].to_f
dese_id = items[headers['School Code']].to_i dese_id = items[headers["School Code"]].to_i
num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0 num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0
items << num_of_students items << num_of_students
benchmark = 250 benchmark = 250
if fte_index.present? && !items[fte_index] != '' if fte_index.present? && !items[fte_index] != ""
result = ((benchmark - (num_of_students / num_of_psychologists)) + benchmark) * 4 / benchmark result = ((benchmark - (num_of_students / num_of_psychologists)) + benchmark) * 4 / benchmark
end end
@ -107,21 +111,21 @@ module Dese
def run_a_sust_i3(filepath:) def run_a_sust_i3(filepath:)
run do |academic_year| run do |academic_year|
admin_data_item_id = 'a-sust-i3' admin_data_item_id = "a-sust-i3"
url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx' url = "https://profiles.doe.mass.edu/statereport/agestaffing.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range, "ctl00_ContentPlaceHolder1_ddYear" => range,
'ctl00_ContentPlaceHolder1_ddJobClassification' => 'Paraprofessional' } "ctl00_ContentPlaceHolder1_ddJobClassification" => "Paraprofessional" }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = lambda { |headers, items| calculation = lambda { |headers, items|
fte_index = headers['FTE Count'] fte_index = headers["FTE Count"]
num_of_paraprofessionals = items[fte_index].to_f num_of_paraprofessionals = items[fte_index].to_f
dese_id = items[headers['School Code']].to_i dese_id = items[headers["School Code"]].to_i
num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0 num_of_students = student_count(filepath: filepaths[0], dese_id:, year: academic_year.range) || 0
items << num_of_students items << num_of_students
benchmark = 43.4 benchmark = 43.4
if fte_index.present? && !items[fte_index] != '' if fte_index.present? && !items[fte_index] != ""
result = ((benchmark - (num_of_students / num_of_paraprofessionals)) + benchmark) * 4 / benchmark result = ((benchmark - (num_of_students / num_of_paraprofessionals)) + benchmark) * 4 / benchmark
end end
@ -134,15 +138,15 @@ module Dese
def run_a_sust_i4(filepath:) def run_a_sust_i4(filepath:)
run do |academic_year| run do |academic_year|
admin_data_item_id = 'a-sust-i4' admin_data_item_id = "a-sust-i4"
url = 'https://profiles.doe.mass.edu/statereport/gradesubjectstaffing.aspx' url = "https://profiles.doe.mass.edu/statereport/gradesubjectstaffing.aspx"
range = academic_year.range range = academic_year.range
selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School', selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
'ctl00_ContentPlaceHolder1_ddYear' => range, "ctl00_ContentPlaceHolder1_ddYear" => range,
'ctl00_ContentPlaceHolder1_ddDisplay' => 'Full-time Equivalents', "ctl00_ContentPlaceHolder1_ddDisplay" => "Full-time Equivalents",
'ctl00_ContentPlaceHolder1_ddSubject' => 'Arts' } "ctl00_ContentPlaceHolder1_ddSubject" => "Arts" }
submit_id = 'btnViewReport' submit_id = "btnViewReport"
calculation = lambda { |_headers, items| calculation = lambda { |_headers, items|
num_of_art_teachers = items.last.to_f num_of_art_teachers = items.last.to_f
dese_id = items[1].to_i dese_id = items[1].to_i
@ -161,3 +165,4 @@ module Dese
end end
end end
end end
end

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

24
lib/tasks/scrape.rake Normal file
View file

@ -0,0 +1,24 @@
# Rake tasks that drive the DESE scrapers. Every task depends on
# :environment so the application's classes are loaded before the scraper
# classes are referenced.
namespace :dashboard do
  namespace :scrape do
    desc "scrape dese site for admin data"
    task admin: :environment do
      puts "scraping data from dese"
      # ThreeATwo is defined as Dashboard::Dese::ThreeATwo, so it is referenced
      # by its fully qualified name.
      # NOTE(review): the remaining scrapers are not visible from this file; if
      # they also live under the Dashboard namespace they need the same
      # Dashboard:: prefix -- confirm against their definitions.
      scrapers = [
        Dese::OneAOne, Dese::OneAThree, Dese::TwoAOne, Dese::TwoCOne, Dese::ThreeAOne,
        Dashboard::Dese::ThreeATwo,
        Dese::ThreeBOne, Dese::ThreeBTwo, Dese::FourAOne, Dese::FourBTwo, Dese::FourDOne,
        Dese::FiveCOne, Dese::FiveDTwo
      ]
      # Each scraper is instantiated with its default arguments and asked to
      # run everything it knows how to scrape.
      scrapers.each { |scraper| scraper.new.run_all }
    end

    desc "scrape dese site for student enrollment information"
    task enrollment: :environment do
      # scrape_enrollments comes from the Enrollments mixin that ThreeATwo
      # includes; the CSV is written inside the engine's data directory.
      enrollment_csv = Dashboard::Engine.root.join("data", "dashboard", "enrollment", "enrollment.csv")
      Dashboard::Dese::ThreeATwo.new.scrape_enrollments(filepath: enrollment_csv)
    end

    desc "scrape dese site for teacher staffing information"
    task staffing: :environment do
      # Staffing writes to its own default filepath (see its initializer).
      Dashboard::Dese::Staffing.new.run_all
    end
  end
end