chore: get admin data scraper running

This commit is contained in:
Nelson Jovel 2024-02-07 11:47:25 -08:00
parent 1810ee0074
commit a142afe022
102 changed files with 41933 additions and 8756 deletions

View file

@ -1,115 +1,117 @@
require 'watir'
require 'csv'
# TODO: convert this to simpler format and add a run_all method
module Dese
class OneAOne
attr_reader :filepath
require "watir"
require "csv"
def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', '1A_1_teacher_data.csv'))
@filepath = filepath
end
module Dashboard
module Dese
class OneAOne
attr_reader :filepath
def run_all
url = 'https://profiles.doe.mass.edu/statereport/teacherdata.aspx'
browser = Watir::Browser.new
write_headers(filepath:)
academic_years = AcademicYear.all
academic_years.each do |academic_year|
document = scrape(browser:, url:, range: academic_year.range)
id = 'a-exp-i1'
write_csv(document:, filepath:, range: academic_year.range, id:) unless document.nil?
def initialize(filepath: Dashboard::Engine.root.join("data", "admin_data", "dese", "1A_1_teacher_data.csv"))
@filepath = filepath
end
browser.close
end
def scrape(browser:, url:, range:)
browser.goto(url)
return unless browser.option(text: 'School').present?
return unless browser.option(text: range).present?
browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(text: 'School')
browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
browser.button(id: 'ctl00_ContentPlaceHolder1_btnViewReport').click
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
Nokogiri::HTML(browser.html)
end
def write_headers(filepath:)
CSV.open(filepath, 'w') do |csv|
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID', 'Total # of Teachers(FTE)', 'Percent of Teachers Licensed',
'Student/Teacher Ratio', 'Percent of Experienced Teachers', 'Percent of Teachers without Waiver or Provisional License', 'Percent Teaching in-field']
csv << headers
def run_all
url = "https://profiles.doe.mass.edu/statereport/teacherdata.aspx"
browser = Watir::Browser.new
write_headers(filepath:)
academic_years = AcademicYear.all
academic_years.each do |academic_year|
document = scrape(browser:, url:, range: academic_year.range)
id = "a-exp-i1"
write_csv(document:, filepath:, range: academic_year.range, id:) unless document.nil?
end
browser.close
end
end
def write_csv(document:, filepath:, range:, id:)
table = document.css('tr')
headers = document.css('.sorting')
header_hash = headers.each_with_index.map { |header, index| [header.text, index] }.to_h
experienced_teacher_index = header_hash['Percent of Experienced Teachers']
dese_id_index = header_hash['School Code']
def scrape(browser:, url:, range:)
browser.goto(url)
CSV.open(filepath, 'a') do |csv|
table.each do |row|
items = row.css('td').map(&:text)
dese_id = items[1].to_i
next if dese_id.nil? || dese_id.zero?
return unless browser.option(text: "School").present?
return unless browser.option(text: range).present?
raw_likert_score = items[experienced_teacher_index].to_f * 4 / 80 if experienced_teacher_index.present?
raw_likert_score ||= 'NA'
likert_score = raw_likert_score
if likert_score != 'NA'
likert_score = 5 if likert_score > 5
likert_score = 1 if likert_score < 1
likert_score = likert_score.round(2)
end
browser.select(id: "ctl00_ContentPlaceHolder1_ddReportType").select(text: "School")
browser.select(id: "ctl00_ContentPlaceHolder1_ddYear").select(text: range)
browser.button(id: "ctl00_ContentPlaceHolder1_btnViewReport").click
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
Nokogiri::HTML(browser.html)
end
output = []
output << raw_likert_score
output << likert_score
output << 'a-exp-i1'
output << range
output << items
output = output.flatten
csv << output
def write_headers(filepath:)
CSV.open(filepath, "w") do |csv|
headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID", "Total # of Teachers(FTE)", "Percent of Teachers Licensed",
"Student/Teacher Ratio", "Percent of Experienced Teachers", "Percent of Teachers without Waiver or Provisional License", "Percent Teaching in-field"]
csv << headers
end
end
in_field_index = header_hash['Percent Teaching In-Field']
def write_csv(document:, filepath:, range:, id:)
table = document.css("tr")
headers = document.css(".sorting")
header_hash = headers.each_with_index.map { |header, index| [header.text, index] }.to_h
experienced_teacher_index = header_hash["Percent of Experienced Teachers"]
dese_id_index = header_hash["School Code"]
CSV.open(filepath, 'a') do |csv|
table.each do |row|
items = row.css('td').map(&:text)
dese_id = items[dese_id_index].to_i
next if dese_id.nil? || dese_id.zero?
CSV.open(filepath, "a") do |csv|
table.each do |row|
items = row.css("td").map(&:text)
dese_id = items[1].to_i
next if dese_id.nil? || dese_id.zero?
percent_in_field = items[in_field_index].to_f if in_field_index.present?
if in_field_index.present? && percent_in_field.present? && !percent_in_field.zero?
raw_likert_score = percent_in_field * 4 / 95
end
raw_likert_score ||= 'NA'
likert_score = raw_likert_score
if likert_score != 'NA'
likert_score = 5 if likert_score > 5
likert_score = 1 if likert_score < 1
likert_score = likert_score.round(2)
raw_likert_score = items[experienced_teacher_index].to_f * 4 / 80 if experienced_teacher_index.present?
raw_likert_score ||= "NA"
likert_score = raw_likert_score
if likert_score != "NA"
likert_score = 5 if likert_score > 5
likert_score = 1 if likert_score < 1
likert_score = likert_score.round(2)
end
output = []
output << raw_likert_score
output << likert_score
output << "a-exp-i1"
output << range
output << items
output = output.flatten
csv << output
end
end
output = []
output << raw_likert_score
output << likert_score
output << 'a-exp-i3'
output << range
output << items
output = output.flatten
csv << output
in_field_index = header_hash["Percent Teaching In-Field"]
CSV.open(filepath, "a") do |csv|
table.each do |row|
items = row.css("td").map(&:text)
dese_id = items[dese_id_index].to_i
next if dese_id.nil? || dese_id.zero?
percent_in_field = items[in_field_index].to_f if in_field_index.present?
if in_field_index.present? && percent_in_field.present? && !percent_in_field.zero?
raw_likert_score = percent_in_field * 4 / 95
end
raw_likert_score ||= "NA"
likert_score = raw_likert_score
if likert_score != "NA"
likert_score = 5 if likert_score > 5
likert_score = 1 if likert_score < 1
likert_score = likert_score.round(2)
end
output = []
output << raw_likert_score
output << likert_score
output << "a-exp-i3"
output << range
output << items
output = output.flatten
csv << output
end
end
end
end
def calculate(cells:)
cells[5].to_f * 4 / 95
def calculate(cells:)
cells[5].to_f * 4 / 95
end
end
end
end