mirror of
https://github.com/edcommonwealth/sqm-dashboards.git
synced 2026-03-07 21:48:16 -08:00
…instead of hard-coded values. Produce a clean CSV and a CSV with all the removed values and columns, with the reason for removal. Add a script for running the cleaning for each project.
86 lines
2.6 KiB
Ruby
86 lines
2.6 KiB
Ruby
module Dese
  # Mixin with the shared steps of a DESE scrape: drive a browser to a report
  # page, choose the requested dropdown options, submit the form, and append the
  # resulting table rows (plus a computed likert score) to a CSV file.
  #
  # The including object gets a lazily created, memoized browser (see #browser).
  # Nothing in this module closes that browser.
  module Scraper
    DELAY = 20 # The dese site will block you if you hit it too many times in a short period of time

    # Everything #run needs to scrape one report for one academic year:
    # the CSV +filepath+ to append to, the page +url+, the +selectors+ to apply
    # (pairs of select-element id and option text), the +submit_id+ of the button
    # to click, the +admin_data_item_id+ written on every row, and the
    # +calculation+ callable invoked as `calculation.call(header_hash, items)`.
    #
    # NOTE: because a String name is passed first, Ruby also registers this class
    # as Struct::Prerequisites.
    Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
                               :calculation)

    # Mirrors a likert score around the middle of the 1..5 scale (1 <-> 5, 2 <-> 4).
    #
    # @param likert_score [Numeric, nil] score to reverse; values outside 1..5
    #   are pulled to the nearest bound before reversing
    # @return [Numeric, nil] the reversed score, or nil when the score is not present
    def reverse_score(likert_score:)
      return unless likert_score.present?

      # Explicit comparisons (rather than #clamp) so a NaN score passes through
      # instead of raising.
      likert_score = 1 if likert_score < 1
      likert_score = 5 if likert_score > 5
      (likert_score - 6).abs
    end

    # Scrapes one report for every academic year, in descending +range+ order.
    #
    # Each academic year is yielded to the caller's block, which must return an
    # object responding to the Prerequisites members. Years for which #get_html
    # returns nil are skipped without writing anything.
    #
    # @yieldparam academic_year [AcademicYear]
    # @yieldreturn [Prerequisites]
    def run
      AcademicYear.all.order(range: :DESC).each do |academic_year|
        prerequisites = yield academic_year

        document = get_html(url: prerequisites.url,
                            selectors: prerequisites.selectors,
                            submit_id: prerequisites.submit_id)
        next if document.nil?

        write_csv(document:,
                  filepath: prerequisites.filepath,
                  range: academic_year.range,
                  id: prerequisites.admin_data_item_id,
                  calculation: prerequisites.calculation)
      end
    end

    # @return [Watir::Browser] the browser shared by every call on this object,
    #   created on first use
    def browser
      @browser ||= Watir::Browser.new
    end

    # Loads +url+, applies each selector, submits the form, and parses the page.
    #
    # @param url [String] page to open
    # @param selectors [Hash] select-element id => option text to choose
    # @param submit_id [String] id of the button that submits the form
    # @return [Nokogiri::HTML::Document, nil] the parsed result page, or nil as
    #   soon as one of the requested option texts is not present on the page
    #   (in that case the form is not submitted and no delay is taken)
    def get_html(url:, selectors:, submit_id:)
      browser.goto(url)

      selectors.each do |select_id, option_text|
        # Leaves the whole method, not just this iteration.
        return unless browser.option(text: option_text).present?

        browser.select(id: select_id).select(text: option_text)
      end

      browser.button(id: submit_id).click
      sleep DELAY # Sleep to prevent hitting mass.edu with too many requests
      Nokogiri::HTML(browser.html)
    end

    # Creates (or truncates) the CSV at +filepath+ and writes a single header row.
    #
    # @param filepath [String] CSV file to overwrite
    # @param headers [Array] header cells
    def write_headers(filepath:, headers:)
      CSV.open(filepath, 'w') { |csv| csv << headers }
    end

    # Appends one CSV row per data row of the scraped table.
    #
    # A row is written only when its second cell converts to a non-zero integer;
    # this also skips rows without `td` cells. Each written row is:
    # raw score, score limited to 1..5 and rounded to two decimals, +id+,
    # +range+, followed by the text of every cell in the table row. When the
    # calculation returns nil (or false), both score columns hold 'NA'.
    #
    # @param document [#css] parsed page; `tr` elements are the rows and
    #   `.sorting` elements are the column headers
    # @param filepath [String] CSV file to append to
    # @param range [Object] academic-year range written on every row
    # @param id [Object] admin data item id written on every row
    # @param calculation [#call] called with (header text => column index, cell
    #   texts); returns the raw score or nil
    def write_csv(document:, filepath:, range:, id:, calculation:)
      rows = document.css('tr')
      header_hash = document.css('.sorting').each_with_index.to_h { |header, index| [header.text, index] }

      CSV.open(filepath, 'a') do |csv|
        rows.each do |row|
          items = row.css('td').map(&:text)
          # nil.to_i and non-numeric text are both 0, so one check covers
          # missing cells, header rows and rows without a numeric id.
          next if items[1].to_i.zero?

          raw_likert_score = calculation.call(header_hash, items) || 'NA'
          likert_score = raw_likert_score
          unless likert_score == 'NA'
            likert_score = 5 if likert_score > 5
            likert_score = 1 if likert_score < 1
            likert_score = likert_score.round(2)
          end

          csv << [raw_likert_score, likert_score, id, range, items].flatten
        end
      end
    end
  end
end
|