mirror of
https://github.com/edcommonwealth/Dashboard.git
synced 2026-03-17 09:46:25 -07:00
chore: get admin data scraper running
This commit is contained in:
parent
1810ee0074
commit
a142afe022
102 changed files with 41933 additions and 8756 deletions
|
|
@ -1,74 +1,76 @@
|
|||
require 'watir'
|
||||
require 'csv'
|
||||
require "watir"
|
||||
require "csv"
|
||||
|
||||
# TODO: convert this to simpler format and add a run_all method
|
||||
module Dese
|
||||
class FourDOne
|
||||
attr_reader :filepath
|
||||
module Dashboard
|
||||
module Dese
|
||||
class FourDOne
|
||||
attr_reader :filepath
|
||||
|
||||
def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', '4D_1_plans_of_grads.csv'))
|
||||
@filepath = filepath
|
||||
end
|
||||
|
||||
def run_all
|
||||
url = 'https://profiles.doe.mass.edu/statereport/plansofhsgrads.aspx'
|
||||
browser = Watir::Browser.new
|
||||
write_headers(filepath:)
|
||||
academic_years = AcademicYear.all
|
||||
academic_years.each do |academic_year|
|
||||
table = scrape(browser:, url:, range: academic_year.range)
|
||||
id = 'a-cgpr-i1'
|
||||
write_csv(table:, filepath:, range: academic_year.range, id:) unless table.nil?
|
||||
def initialize(filepath: Dashboard::Engine.root.join("data", "admin_data", "dese", "4D_1_plans_of_grads.csv"))
|
||||
@filepath = filepath
|
||||
end
|
||||
browser.close
|
||||
end
|
||||
|
||||
def scrape(browser:, url:, range:)
|
||||
browser.goto(url)
|
||||
|
||||
return unless browser.option(text: range).present?
|
||||
|
||||
browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(/School/)
|
||||
browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
|
||||
browser.button(id: 'btnViewReport').click
|
||||
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
|
||||
document = Nokogiri::HTML(browser.html)
|
||||
document.css('tr')
|
||||
end
|
||||
|
||||
def write_headers(filepath:)
|
||||
CSV.open(filepath, 'w') do |csv|
|
||||
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID', '4 Year Private College', '4 Year Public College', '2 Year Private College', '2 Year Public College',
|
||||
'Other Post Secondary', 'Apprenticeship', 'Work', 'Military', 'Other', 'Unknown', 'Total']
|
||||
csv << headers
|
||||
def run_all
|
||||
url = "https://profiles.doe.mass.edu/statereport/plansofhsgrads.aspx"
|
||||
browser = Watir::Browser.new
|
||||
write_headers(filepath:)
|
||||
academic_years = AcademicYear.all
|
||||
academic_years.each do |academic_year|
|
||||
table = scrape(browser:, url:, range: academic_year.range)
|
||||
id = "a-cgpr-i1"
|
||||
write_csv(table:, filepath:, range: academic_year.range, id:) unless table.nil?
|
||||
end
|
||||
browser.close
|
||||
end
|
||||
end
|
||||
|
||||
def write_csv(table:, filepath:, range:, id:)
|
||||
CSV.open(filepath, 'a') do |csv|
|
||||
table.each do |row|
|
||||
items = row.css('td').map(&:text)
|
||||
dese_id = items[1].to_i
|
||||
next if dese_id.nil? || dese_id.zero?
|
||||
def scrape(browser:, url:, range:)
|
||||
browser.goto(url)
|
||||
|
||||
raw_likert_score = calculate(cells: items)
|
||||
likert_score = raw_likert_score
|
||||
likert_score = 5 if raw_likert_score > 5
|
||||
likert_score = 1 if raw_likert_score < 1
|
||||
likert_score = likert_score.round(2)
|
||||
output = []
|
||||
output << raw_likert_score
|
||||
output << likert_score
|
||||
output << id
|
||||
output << range
|
||||
output << items
|
||||
csv << output.flatten
|
||||
return unless browser.option(text: range).present?
|
||||
|
||||
browser.select(id: "ctl00_ContentPlaceHolder1_ddReportType").select(/School/)
|
||||
browser.select(id: "ctl00_ContentPlaceHolder1_ddYear").select(text: range)
|
||||
browser.button(id: "btnViewReport").click
|
||||
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
|
||||
document = Nokogiri::HTML(browser.html)
|
||||
document.css("tr")
|
||||
end
|
||||
|
||||
def write_headers(filepath:)
|
||||
CSV.open(filepath, "w") do |csv|
|
||||
headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID", "4 Year Private College", "4 Year Public College", "2 Year Private College", "2 Year Public College",
|
||||
"Other Post Secondary", "Apprenticeship", "Work", "Military", "Other", "Unknown", "Total"]
|
||||
csv << headers
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def calculate(cells:)
|
||||
(cells[2].to_f + cells[3].to_f + cells[4].to_f + cells[5].to_f + cells[6].to_f + cells[7].to_f + cells[8].to_f) * 4 / 75
|
||||
def write_csv(table:, filepath:, range:, id:)
|
||||
CSV.open(filepath, "a") do |csv|
|
||||
table.each do |row|
|
||||
items = row.css("td").map(&:text)
|
||||
dese_id = items[1].to_i
|
||||
next if dese_id.nil? || dese_id.zero?
|
||||
|
||||
raw_likert_score = calculate(cells: items)
|
||||
likert_score = raw_likert_score
|
||||
likert_score = 5 if raw_likert_score > 5
|
||||
likert_score = 1 if raw_likert_score < 1
|
||||
likert_score = likert_score.round(2)
|
||||
output = []
|
||||
output << raw_likert_score
|
||||
output << likert_score
|
||||
output << id
|
||||
output << range
|
||||
output << items
|
||||
csv << output.flatten
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def calculate(cells:)
|
||||
(cells[2].to_f + cells[3].to_f + cells[4].to_f + cells[5].to_f + cells[6].to_f + cells[7].to_f + cells[8].to_f) * 4 / 75
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue