Add scraper for 3A-1 and 2C-1. Finishes #183105787 and #183105716

pull/1/head
rebuilt 3 years ago
parent b5bc9c9de4
commit 83ef9310a4

@ -25,7 +25,7 @@ module Dese
browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(text: 'School')
browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
browser.button(id: 'ctl00_ContentPlaceHolder1_btnViewReport').click
sleep 2 # Sleep to prevent hitting mass.edu with too many requests
sleep 3 # Sleep to prevent hitting mass.edu with too many requests
Nokogiri::HTML(browser.html)
end

@ -0,0 +1,83 @@
module Dese
  # Helpers shared by the DESE scrapers: drive a Watir browser through a report
  # form, parse the resulting page with Nokogiri and append scored rows to a CSV.
  #
  # An including class calls #run with a block. For each academic year the block
  # must return an object responding to `url`, `selectors`, `submit_id`,
  # `filepath`, `admin_data_item_id` and `calculation`.
  module Scraper
    # Seconds to sleep after submitting the report form.
    DELAY = 3

    # Clamps a score to 1..5 and mirrors it on that scale (1 <-> 5, 2 <-> 4).
    #
    # @param likert_score [Numeric, nil] the score to reverse
    # @return [Numeric, nil] the reversed score, or nil when the score is blank
    def reverse_score(likert_score:)
      return nil unless likert_score.present?

      (likert_score.clamp(1, 5) - 6).abs
    end

    # Scrapes one report per academic year (ordered by range, descending) and
    # appends its rows to the CSV named by the yielded prerequisites.
    #
    # @yieldparam academic_year [AcademicYear]
    # @yieldreturn [#url, #selectors, #submit_id, #filepath, #admin_data_item_id, #calculation]
    def run
      AcademicYear.all.order(range: :DESC).each do |academic_year|
        prerequisites = yield academic_year
        document = get_html(url: prerequisites.url,
                            selectors: prerequisites.selectors,
                            submit_id: prerequisites.submit_id)
        # get_html returns nil when the form does not offer a requested option.
        next if document.nil?

        write_csv(document:,
                  filepath: prerequisites.filepath,
                  range: academic_year.range,
                  id: prerequisites.admin_data_item_id,
                  calculation: prerequisites.calculation)
      end
    end

    # @return [Watir::Browser] the memoized browser session
    def browser
      @browser ||= Watir::Browser.new
    end

    # Opens `url`, picks every requested dropdown option, submits the form and
    # returns the parsed page.
    #
    # @param url [String] page to open
    # @param selectors [Hash{String => String}] dropdown element id => option text
    # @param submit_id [String] id of the submit button
    # @return [Nokogiri::HTML::Document, nil] nil when an option text is not present on the page
    def get_html(url:, selectors:, submit_id:)
      browser.goto(url)
      selectors.each do |key, value|
        return unless browser.option(text: value).present?

        browser.select(id: key).select(text: value)
      end
      browser.button(id: submit_id).click
      sleep DELAY # Sleep to prevent hitting mass.edu with too many requests
      Nokogiri::HTML(browser.html)
    end

    # Truncates (or creates) the CSV at `filepath` and writes the header row.
    def write_headers(filepath:, headers:)
      CSV.open(filepath, 'w') do |csv|
        csv << headers
      end
    end

    # Appends one CSV row per table row of `document`.
    #
    # Each row is: raw score, score clamped to 1..5 and rounded to two decimals,
    # `id`, `range`, followed by the text of every cell. Both score columns hold
    # 'NA' when `calculation` returns nil.
    #
    # @param document [#css] parsed page
    # @param filepath [String, Pathname] CSV to append to
    # @param range [String] academic year label written to each row
    # @param id [String] admin data item id written to each row
    # @param calculation [#call] called with (header text => column index hash, cell texts)
    def write_csv(document:, filepath:, range:, id:, calculation:)
      header_hash = document.css('.sorting').each_with_index.to_h { |header, index| [header.text, index] }
      CSV.open(filepath, 'a') do |csv|
        document.css('tr').each do |row|
          items = row.css('td').map(&:text)
          # Rows whose second cell is not a non-zero number (e.g. rows without
          # <td> cells) carry no DESE ID and are skipped. to_i never returns nil.
          next if items[1].to_i.zero?

          raw_likert_score = calculation.call(header_hash, items) || 'NA'
          likert_score = raw_likert_score == 'NA' ? raw_likert_score : raw_likert_score.clamp(1, 5).round(2)
          csv << [raw_likert_score, likert_score, id, range, *items]
        end
      end
    end
  end
end

@ -0,0 +1,46 @@
require 'watir'
require 'csv'
module Dese
  # Scraper for admin data item a-reso-i1 (average class size).
  #
  # Instantiating the class does all the work: it writes the CSV header row,
  # scrapes every academic year via Dese::Scraper#run and closes the browser.
  class ThreeAOne
    include Dese::Scraper

    attr_reader :filepaths

    # Anonymous struct assigned to a constant. Passing a name string to
    # Struct.new would instead register a global Struct::Prerequisites constant.
    Prerequisites = Struct.new(:filepath, :url, :selectors, :submit_id, :admin_data_item_id, :calculation)

    HEADERS = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
               'Total # of Classes', 'Average Class Size', 'Number of Students', 'Female %', 'Male %',
               'English Language Learner %', 'Students with Disabilities %', 'Economically Disadvantaged %'].freeze

    # @param filepaths [Array<Pathname, String>] output CSVs; only the first entry is used.
    #   The default used to point at the attendance scraper's two_c_one_attendance.csv
    #   (a copy-paste slip that would overwrite that file); it now has its own file.
    def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'three_a_one_average_class_size.csv')])
      @filepaths = filepaths
      write_headers(filepath: filepaths[0], headers: HEADERS)
      run_a_reso_i1
    ensure
      # Close the session even when scraping raises, without launching a
      # browser just to close it (@browser is the memo behind Scraper#browser).
      @browser&.close
    end

    # Scrapes the class-size report for every academic year and appends the
    # a-reso-i1 rows to the first configured CSV.
    def run_a_reso_i1
      run do |academic_year|
        url = 'https://profiles.doe.mass.edu/statereport/classsizebygenderpopulation.aspx'
        selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                      'ctl00_ContentPlaceHolder1_ddYear' => academic_year.range }
        submit_id = 'btnViewReport'
        calculation = lambda { |headers, items|
          class_size_index = headers['Average Class Size']
          # A missing column or an empty cell yields nil, which the scraper
          # records as 'NA'. The guard must run before the cell is indexed:
          # items[nil] raises TypeError.
          next if class_size_index.nil? || items[class_size_index].blank?

          benchmark = 20
          average_class_size = items[class_size_index].to_f
          # A class size equal to the benchmark scores 4; smaller classes score higher.
          ((benchmark - average_class_size) + benchmark) * 4 / benchmark
        }
        Prerequisites.new(filepaths[0], url, selectors, submit_id, 'a-reso-i1', calculation)
      end
    end
  end
end

@ -3,6 +3,7 @@ require 'csv'
module Dese
class TwoAOneScraper
include Dese::Scraper
attr_reader :filepaths
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
@ -31,10 +32,12 @@ module Dese
'ctl00_ContentPlaceHolder1_ddYear' => range }
submit_id = 'ctl00_ContentPlaceHolder1_btnViewReport'
calculation = lambda { |headers, items|
suspensions = headers['% Out-of-School Suspension']
result = items[suspensions].to_f * 4 / 5.27 if suspensions.present?
reverse_score(likert_score: result)
suspensions_index = headers['% Out-of-School Suspension']
benchmark = 5.27
suspension_rate = items[suspensions_index].to_f
if suspensions_index.present? && items[suspensions_index] != ''
((benchmark - suspension_rate) + benchmark) * 4 / 5.27
end
}
admin_data_item_id = 'a-phys-i1'
Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
@ -53,93 +56,16 @@ module Dese
'ctl00_ContentPlaceHolder1_ddYear' => range }
submit_id = 'ctl00_ContentPlaceHolder1_btnViewReport'
calculation = lambda { |headers, items|
days_missed = headers['% > 10 Days']
result = items[days_missed].to_f * 4 if days_missed.present?
reverse_score(likert_score: result)
days_missed_index = headers['% > 10 Days']
benchmark = 1
missed_days = items[days_missed_index].to_f
if days_missed_index.present? && items[days_missed_index] != ''
((benchmark - missed_days) + benchmark) * 4 / benchmark
end
}
admin_data_item_id = 'a-phys-i3'
Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
end
end
# Bounds the score to the 1..5 scale and flips it (1 becomes 5, 2 becomes 4, ...).
# Returns nil when the score is blank.
def reverse_score(likert_score:)
  return nil unless likert_score.present?

  bounded = if likert_score < 1
              1
            elsif likert_score > 5
              5
            else
              likert_score
            end
  (bounded - 6).abs
end
# Walks every academic year (ordered by range, descending), asks the block for
# that year's scrape settings, fetches the page and appends its rows to the CSV.
# Years for which get_html returns nil are skipped.
def run
  AcademicYear.all.order(range: :DESC).each do |year|
    settings = yield year
    page = get_html(url: settings.url, selectors: settings.selectors, submit_id: settings.submit_id)
    next if page.nil?

    write_csv(document: page,
              filepath: settings.filepath,
              range: year.range,
              id: settings.admin_data_item_id,
              calculation: settings.calculation)
  end
end
# Returns the shared Watir browser, launching it on first use.
def browser
  return @browser if @browser

  @browser = Watir::Browser.new
end
# Opens `url`, chooses each requested dropdown option, submits the form and
# returns the page parsed by Nokogiri. Returns nil as soon as a requested
# option text is not present on the page.
def get_html(url:, selectors:, submit_id:)
  session = browser
  session.goto(url)
  selectors.each do |dropdown_id, option_text|
    return nil unless session.option(text: option_text).present?

    session.select(id: dropdown_id).select(text: option_text)
  end
  session.button(id: submit_id).click
  sleep 2 # Sleep to prevent hitting mass.edu with too many requests
  Nokogiri::HTML(session.html)
end
# Creates or truncates the CSV at `filepath` and writes `headers` as its only row.
def write_headers(filepath:, headers:)
  CSV.open(filepath, 'w') { |out| out.add_row(headers) }
end
# Appends one CSV line per table row of `document`: raw score, score limited to
# 1..5 and rounded to two decimals, `id`, `range`, then the text of every cell.
# Rows whose second cell does not convert to a non-zero integer are skipped;
# a nil result from `calculation` is written as 'NA' in both score columns.
def write_csv(document:, filepath:, range:, id:, calculation:)
  table_rows = document.css('tr')
  column_positions = {}
  document.css('.sorting').each_with_index do |heading, position|
    column_positions[heading.text] = position
  end
  CSV.open(filepath, 'a') do |out|
    table_rows.each do |table_row|
      cells = table_row.css('td').map { |cell| cell.text }
      next if cells[1].to_i.zero?

      raw_score = calculation.call(column_positions, cells) || 'NA'
      score = raw_score
      unless score == 'NA'
        score = 5 if score > 5
        score = 1 if score < 1
        score = score.round(2)
      end
      out << [raw_score, score, id, range, cells].flatten
    end
  end
end
end
end

@ -0,0 +1,74 @@
require 'watir'
require 'csv'
module Dese
  # Scraper for the attendance admin data items a-vale-i1 (chronic absence) and
  # a-vale-i2 (attendance rate).
  #
  # Instantiating the class does all the work: it writes the CSV header row,
  # scrapes both items for every academic year via Dese::Scraper#run and closes
  # the browser.
  class TwoCOneScraper
    include Dese::Scraper

    attr_reader :filepaths

    # Anonymous struct assigned to a constant. Passing a name string to
    # Struct.new would instead register a global Struct::Prerequisites constant.
    Prerequisites = Struct.new(:filepath, :url, :selectors, :submit_id, :admin_data_item_id, :calculation)

    HEADERS = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
               'Attendance Rate', 'Average # of Absences', 'Absent 10 or more days', 'Chronically Absent (10% or more)',
               'Chronically Absent (20% or more)', 'Unexcused > 9 days'].freeze

    ATTENDANCE_URL = 'https://profiles.doe.mass.edu/statereport/attendance.aspx'
    SUBMIT_ID = 'btnViewReport'

    # @param filepaths [Array<Pathname, String>] output CSVs; only the first entry is used
    def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'two_c_one_attendance.csv')])
      @filepaths = filepaths
      write_headers(filepath: filepaths[0], headers: HEADERS)
      run_a_vale_i1
      run_a_vale_i2
    ensure
      # Close the session even when scraping raises, without launching a
      # browser just to close it (@browser is the memo behind Scraper#browser).
      @browser&.close
    end

    # Appends a-vale-i1 rows, scored from 'Chronically Absent (10% or more)'.
    def run_a_vale_i1
      run do |academic_year|
        calculation = lambda { |headers, items|
          absence_index = headers['Chronically Absent (10% or more)']
          # A missing column or an empty cell yields nil, which the scraper
          # records as 'NA'. The guard must run before the cell is indexed:
          # items[nil] raises TypeError.
          next if absence_index.nil? || items[absence_index].blank?

          benchmark = 10
          absence_rate = items[absence_index].to_f
          # A rate equal to the benchmark scores 4; lower absence scores higher.
          ((benchmark - absence_rate) + benchmark) * 4 / benchmark
        }
        Prerequisites.new(filepaths[0], ATTENDANCE_URL, attendance_selectors(academic_year), SUBMIT_ID,
                          'a-vale-i1', calculation)
      end
    end

    # Appends a-vale-i2 rows, scored from the attendance rate column.
    def run_a_vale_i2
      run do |academic_year|
        calculation = lambda { |headers, items|
          # NOTE(review): the surrounding spaces are part of the lookup key as
          # written, presumably matching the page's header text. Confirm.
          attendance_index = headers[' Attendance Rate ']
          next if attendance_index.nil?

          benchmark = 90
          items[attendance_index].to_f * 4 / benchmark
        }
        Prerequisites.new(filepaths[0], ATTENDANCE_URL, attendance_selectors(academic_year), SUBMIT_ID,
                          'a-vale-i2', calculation)
      end
    end

    private

    # Dropdown selections for the school-level attendance report of one year.
    def attendance_selectors(academic_year)
      { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
        'ctl00_ContentPlaceHolder1_ddYear' => report_year_label(academic_year.range) }
    end

    # Option text to select for a given year: 2021-22 and 2020-21 are looked up
    # with an " (End of year)" suffix, every other range unchanged.
    def report_year_label(range)
      case range
      when '2021-22', '2020-21'
        "#{range} (End of year)"
      else
        range
      end
    end
  end
end

File diff suppressed because it is too large Load Diff

@ -1,10 +0,0 @@
require 'watir'
require 'csv'
module Dese
  # Placeholder scraper: accepts an output path but performs no work.
  class OneAScraper
    # @param filepath [Pathname, String] CSV destination (currently unused)
    def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', 'scraped.csv'))
    end

    # No-op.
    #
    # @return [nil]
    def run
      nil
    end
  end
end

@ -0,0 +1,57 @@
require 'rails_helper'
require 'fileutils'
require 'csv'
# Integration spec for Dese::ThreeAOne.
#
# NOTE: only the first example runs the scraper; the later examples read the CSV
# it wrote, so they depend on the examples running in the defined order.
RSpec.describe Dese::ThreeAOne do
  let(:academic_years) do
    %w[2021-22 2020-21 2019-20 2018-19 2017-18 2016-17].map do |range|
      create(:academic_year, range:)
    end
  end
  let(:i1_filepath) { Rails.root.join('tmp', 'spec', 'dese', 'three_a_one_average_class_size.csv') }
  let(:filepaths) { [i1_filepath] }

  before do
    FileUtils.mkdir_p 'tmp/spec/dese'
    academic_years
  end

  context 'Creating a new Scraper' do
    it 'creates a csv file with the scraped data' do
      Dese::ThreeAOne.new(filepaths:)
      expect(i1_filepath).to exist
    end

    it 'has the correct headers' do
      # Parse the header row as CSV rather than splitting the raw line, so the
      # last column does not carry the trailing newline.
      headers = CSV.read(i1_filepath).first
      expect(headers).to eq ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                             'School Name', 'DESE ID', 'Total # of Classes', 'Average Class Size',
                             'Number of Students', 'Female %', 'Male %', 'English Language Learner %',
                             'Students with Disabilities %', 'Economically Disadvantaged %']
    end

    it 'has the right likert score results for a-reso-i1' do
      results = CSV.parse(File.read(i1_filepath), headers: true).filter_map do |row|
        next unless row['Admin Data Item'] == 'a-reso-i1' && row['Academic Year'] == '2020-21'
        # 'NA' marks a row without a score; 'NA'.to_f would silently become 0.0.
        next if row['Likert Score'].nil? || row['Likert Score'] == 'NA'

        row['Likert Score'].to_f
      end
      expect(results.take(20)).to eq [4.22, 5.0, 4.58, 3.46, 3.98, 3.68, 4.06, 4.84, 4.42, 4.66, 5.0, 4.6, 4.26, 4.46,
                                      4.2, 4.66, 5.0, 4.6, 4.28, 5.0]
    end
  end
end

@ -47,20 +47,21 @@ RSpec.describe Dese::TwoAOneScraper do
it 'has the right likert score results for a-phys-i1' do
results = CSV.parse(File.read(i1_filepath), headers: true).map do |row|
row['Likert Score'].to_f
end
row['Likert Score'].to_f unless row['Likert Score'] == 'NA' || row['Likert Score'].nil?
end.flatten.compact
expect(results.take(20)).to eq [5.0, 5.0, 4.33, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
expect(results.take(20)).to eq [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
5.0, 5.0, 5.0, 5.0]
end
it 'has the right likert score results for a-exp-i3' do
results = CSV.parse(File.read(i3_filepath), headers: true).map do |row|
next unless row['Admin Data Item'] == 'a-phys-i3' && row['Academic Year'] == '2020-21'
row['Likert Score'].to_f
row['Likert Score'].to_f unless row['Likert Score'] == 'NA' || row['Likert Score'].nil?
end.flatten.compact
expect(results.take(20)).to eq [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
expect(results.take(20)).to eq [5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 3.6, 5.0,
5.0, 5.0, 5.0, 5.0]
end
end
end

@ -0,0 +1,67 @@
require 'rails_helper'
require 'fileutils'
require 'csv'
# Integration spec for Dese::TwoCOneScraper.
#
# NOTE: only the first example runs the scraper; the later examples read the CSV
# it wrote, so they depend on the examples running in the defined order.
RSpec.describe Dese::TwoCOneScraper do
  let(:academic_years) do
    %w[2021-22 2020-21 2019-20 2018-19 2017-18 2016-17].map do |range|
      create(:academic_year, range:)
    end
  end
  let(:i1_filepath) { Rails.root.join('tmp', 'spec', 'dese', 'two_c_one_attendance.csv') }
  let(:filepaths) { [i1_filepath] }

  before do
    FileUtils.mkdir_p 'tmp/spec/dese'
    academic_years
  end

  # Likert scores of the scored rows for one admin data item in 2021-22.
  # Rows marked 'NA' are skipped; 'NA'.to_f would silently become 0.0.
  def likert_scores_for(item_id)
    CSV.parse(File.read(i1_filepath), headers: true).filter_map do |row|
      next unless row['Admin Data Item'] == item_id && row['Academic Year'] == '2021-22'
      next if row['Likert Score'].nil? || row['Likert Score'] == 'NA'

      row['Likert Score'].to_f
    end
  end

  context 'Creating a new Scraper' do
    it 'creates a csv file with the scraped data' do
      Dese::TwoCOneScraper.new(filepaths:)
      expect(i1_filepath).to exist
    end

    it 'has the correct headers' do
      # Parse the header row as CSV rather than splitting the raw line, so the
      # last column does not carry the trailing newline.
      headers = CSV.read(i1_filepath).first
      expect(headers).to eq ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                             'School Name', 'DESE ID', 'Attendance Rate', 'Average # of Absences',
                             'Absent 10 or more days', 'Chronically Absent (10% or more)',
                             'Chronically Absent (20% or more)', 'Unexcused > 9 days']
    end

    it 'has the right likert score results for a-vale-i1' do
      expect(likert_scores_for('a-vale-i1').take(20)).to eq [1.88, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.16, 3.84, 1.0,
                                                             4.6, 3.64, 4.84, 2.68, 2.84, 3.08, 1.0, 2.56, 3.96, 1.0]
    end

    it 'has the right likert score results for a-vale-i2' do
      expect(likert_scores_for('a-vale-i2').take(20)).to eq [4.2, 4.07, 4.11, 4.14, 4.0, 3.93, 3.92, 4.22, 4.21,
                                                             4.09, 4.24, 4.2, 4.26, 4.19, 4.2, 4.22, 4.16, 4.18,
                                                             4.23, 4.02]
    end
  end
end
Loading…
Cancel
Save