sqm-dashboards/app/services/dese/scraper.rb
Nelson Jovel 2561fa28fc feat: Split academic year into seasons if the academic year's range is
initialized with a season, i.e. "2024-25 Fall".  Update scrapers for
admin data, enrollment and staffing to use the new range standard
correctly.  Update the loaders for admin data, enrollment and staffing
so that they populate all seasons in a given year.  So admin data for
2024-25 gets loaded into "2024-25 Fall" and "2024-25 Spring".  Add tests
for the new range format.  Set the default cutoff for the start of the Spring season to the last Sunday in February.
2024-04-27 14:05:02 -07:00

89 lines
2.8 KiB
Ruby

module Dese
module Scraper
# Seconds to pause after each form submission. The DESE site will block you
# if you hit it too many times in a short period of time.
DELAY = 20

# Describes one scrape job; `run` expects its block to return one of these.
#
# filepath           - CSV file that write_csv appends rows to
# url                - page that get_html opens
# selectors          - select-list id => option text pairs chosen before submitting
# submit_id          - id of the button that submits the form
# admin_data_item_id - id written into every CSV row
# calculation        - callable invoked as calculation.call(header_hash, items)
#
# Built as an anonymous Struct assigned to a constant instead of
# Struct.new("Prerequisites", ...): the string form additionally registers a
# global Struct::Prerequisites, which pollutes the Struct namespace and
# triggers "redefining constant" warnings whenever this file is reloaded.
Prerequisites = Struct.new(:filepath, :url, :selectors, :submit_id, :admin_data_item_id, :calculation)
# Flips a likert score on the 1..5 scale so that 1 becomes 5, 2 becomes 4, etc.
# Scores outside the scale are first pulled back to the nearest bound.
#
# @param likert_score [Numeric, nil] score to reverse
# @return [Numeric, nil] the reversed score, or nil when no score is present
def reverse_score(likert_score:)
  return nil unless likert_score.present?

  bounded = if likert_score < 1
              1
            elsif likert_score > 5
              5
            else
              likert_score
            end

  # Distance from 6 mirrors the value around the midpoint of the scale.
  (bounded - 6).abs
end
# Scrapes one page per academic year and appends the results to a CSV.
#
# Ranges are reduced to their season-less form and de-duplicated, so a year
# stored as several seasons is only scraped once. For each resulting year the
# caller's block is yielded an AcademicYear and must return an object that
# responds to url, selectors, submit_id, filepath, admin_data_item_id and
# calculation (see Prerequisites).
def run
  seasonless_ranges = AcademicYear.all.order(range: :DESC).map(&:range_without_season).uniq
  academic_years = seasonless_ranges.map { |range| AcademicYear.new(range:) }

  academic_years.each do |academic_year|
    prerequisites = yield academic_year
    document = get_html(
      url: prerequisites.url,
      selectors: prerequisites.selectors,
      submit_id: prerequisites.submit_id
    )
    next if document.nil?

    write_csv(
      document:,
      filepath: prerequisites.filepath,
      range: academic_year.range_without_season,
      id: prerequisites.admin_data_item_id,
      calculation: prerequisites.calculation
    )
  end
end
# Returns the Watir browser used for scraping, creating it on first use and
# reusing the same instance afterwards.
def browser
  return @browser if @browser

  @browser = Watir::Browser.new
end
# Opens +url+, picks the requested dropdown options, submits the form and
# returns the resulting page parsed by Nokogiri.
#
# @param url [String] page to open
# @param selectors [Hash] select-list id => text of the option to choose
# @param submit_id [String] id of the button that submits the form
# @return the value of Nokogiri::HTML for the browser's current HTML
def get_html(url:, selectors:, submit_id:)
  browser.goto(url)

  selectors.each do |select_id, option_text|
    # Options that are not present are skipped rather than raising.
    # NOTE(review): the presence check looks for the option anywhere on the
    # page, not inside the select list identified by select_id - confirm
    # that is intended.
    if browser.option(text: option_text).present?
      browser.select(id: select_id).select(text: option_text)
    end
  end

  browser.button(id: submit_id).click
  # Sleep to prevent hitting mass.edu with too many requests
  sleep DELAY
  Nokogiri::HTML(browser.html)
end
# Creates the CSV at +filepath+ (truncating any existing file) and writes
# +headers+ as its single row.
#
# @param filepath [String] CSV file to (re)create
# @param headers [Array] column names for the header row
def write_headers(filepath:, headers:)
  CSV.open(filepath, "w") { |csv| csv.add_row(headers) }
end
# Appends one CSV row to +filepath+ for every usable table row in +document+.
#
# Each written row is: raw score, score limited to 1..5 and rounded to two
# decimals, +id+, +range+, followed by the text of every cell in the table row.
# When +calculation+ returns nil or false, both score columns are "NA".
#
# @param document [#css] parsed page; queried for "tr" rows and ".sorting" headers
# @param filepath [String] CSV file opened in append mode
# @param range [String] academic year range written into each row
# @param id [String] admin data item id written into each row
# @param calculation [#call] invoked with (header text => column index hash, cell texts)
def write_csv(document:, filepath:, range:, id:, calculation:)
  rows = document.css("tr")

  # Lets the calculation look up a cell by its column header text.
  header_hash = {}
  document.css(".sorting").each_with_index do |header, index|
    header_hash[header.text] = index
  end

  CSV.open(filepath, "a") do |csv|
    rows.each do |row|
      items = row.css("td").map(&:text)

      # The second cell is read as the DESE id; to_i gives 0 for a missing or
      # non-numeric cell, and such rows are skipped.
      next if items[1].to_i.zero?

      raw_likert_score = calculation.call(header_hash, items) || "NA"

      likert_score =
        if raw_likert_score == "NA"
          raw_likert_score
        else
          bounded = if raw_likert_score > 5
                      5
                    elsif raw_likert_score < 1
                      1
                    else
                      raw_likert_score
                    end
          bounded.round(2)
        end

      csv << [raw_likert_score, likert_score, id, range, items].flatten
    end
  end
end
end
end