parent
048aacd641
commit
f783ce9724
@ -0,0 +1,109 @@
|
|||||||
|
require 'watir'
|
||||||
|
require 'csv'
|
||||||
|
|
||||||
|
module Dese
  # Scrapes the teacher-data state report from profiles.doe.mass.edu and
  # writes Likert-scored rows for two admin data items (a-exp-i1 and
  # a-exp-i3) to a CSV file.
  #
  # All of the work happens in the constructor: instantiating the class
  # opens a Watir browser, truncates the CSV file, and scrapes one report
  # per AcademicYear record.
  class OneAScraper
    REPORT_URL = 'https://profiles.doe.mass.edu/statereport/teacherdata.aspx'

    # Header row of the output file. The first four columns are produced by
    # this class; the remaining ones label the cells copied from the report.
    CSV_HEADERS = [
      'Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
      'Total # of Teachers(FTE)', 'Percent of Teachers Licensed', 'Student/Teacher Ratio',
      'Percent of Experienced Teachers', 'Percent of Teachers without Waiver or Provisional License',
      'Percent Teaching in-field'
    ].freeze

    # Placeholder written when a score cannot be computed.
    MISSING = 'NA'

    # Column used for the DESE id when the report has no 'School Code' header.
    FALLBACK_DESE_ID_INDEX = 1

    # One entry per admin data item, written in this order.
    #   column          - report header whose cell holds the percentage
    #   benchmark       - percentage that maps to a raw Likert score of 4
    #   zero_is_missing - whether a 0 percentage is reported as 'NA'
    #                     instead of being scored
    METRICS = [
      { item: 'a-exp-i1', column: 'Percent of Experienced Teachers', benchmark: 80, zero_is_missing: false },
      { item: 'a-exp-i3', column: 'Percent Teaching In-Field', benchmark: 95, zero_is_missing: true }
    ].freeze

    # Runs the whole scrape.
    #
    # filepath - destination CSV; it is overwritten, then appended to once
    #            per academic year for which a report could be loaded.
    def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', 'one_a.csv'))
      browser = Watir::Browser.new
      write_headers(filepath:)

      AcademicYear.all.each do |academic_year|
        range = academic_year.range
        document = scrape(browser:, url: REPORT_URL, range:)
        next if document.nil?

        write_csv(document:, filepath:, range:, id: 'a-exp-i1')
      end
    ensure
      # Close the browser even when scraping or writing raises, so a failed
      # run does not leave a browser process behind.
      browser&.close
    end

    # Loads the report for one academic year.
    #
    # browser - Watir browser used to drive the page
    # url     - address of the report page
    # range   - academic year label to pick in the year dropdown
    #
    # Returns the parsed Nokogiri HTML document of the page after the report
    # was requested, or nil when the page offers no 'School' option or no
    # option whose text equals range.
    def scrape(browser:, url:, range:)
      browser.goto(url)

      return unless browser.option(text: 'School').present?
      return unless browser.option(text: range).present?

      browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(text: 'School')
      browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
      browser.button(id: 'ctl00_ContentPlaceHolder1_btnViewReport').click
      sleep 2 # Sleep to prevent hitting mass.edu with too many requests
      Nokogiri::HTML(browser.html)
    end

    # Creates (or truncates) the CSV at filepath and writes the header row.
    def write_headers(filepath:)
      CSV.open(filepath, 'w') { |csv| csv << CSV_HEADERS }
    end

    # Appends the scored rows of one report to the CSV at filepath: first
    # every school row scored for a-exp-i1, then every school row scored for
    # a-exp-i3.
    #
    # document - object answering css('tr') with the table rows and
    #            css('.sorting') with the header cells
    # filepath - CSV to append to
    # range    - academic year label written to every row
    # id       - accepted for interface compatibility but not used; the
    #            item ids come from METRICS
    #
    # Rows whose DESE id cell does not convert to a non-zero integer (header
    # rows, rows without an id) are skipped. The id column is the one headed
    # 'School Code'; when that header is absent, column
    # FALLBACK_DESE_ID_INDEX is used for both passes.
    def write_csv(document:, filepath:, range:, id:)
      rows = document.css('tr')
      columns = document.css('.sorting').each_with_index.to_h { |header, index| [header.text, index] }
      dese_id_index = columns.fetch('School Code', FALLBACK_DESE_ID_INDEX)

      school_rows = rows.map { |row| row.css('td').map(&:text) }
                        .reject { |items| items[dese_id_index].to_i.zero? }

      CSV.open(filepath, 'a') do |csv|
        METRICS.each do |metric|
          value_index = columns[metric[:column]]

          school_rows.each do |items|
            raw = raw_likert(items, value_index, metric[:benchmark], zero_is_missing: metric[:zero_is_missing])
            csv << [raw, likert(raw), metric[:item], range, *items]
          end
        end
      end
    end

    # Raw (unclamped) Likert value for the sixth cell of a row, using 95 as
    # the percentage that maps to a score of 4. Not called by the other
    # methods of this class.
    def calculate(cells:)
      cells[5].to_f * 4 / 95
    end

    private

    # Scales the percentage found at index so that benchmark maps to 4.
    # Returns MISSING when the column is absent, or when the value is 0 and
    # the metric treats 0 as missing.
    def raw_likert(items, index, benchmark, zero_is_missing:)
      return MISSING if index.nil?

      percent = items[index].to_f
      return MISSING if zero_is_missing && percent.zero?

      percent * 4 / benchmark
    end

    # Clamps a raw score to the 1..5 Likert scale and rounds it to two
    # decimals; MISSING is passed through unchanged.
    def likert(raw)
      return raw if raw == MISSING

      raw.clamp(1, 5).round(2)
    end
  end
end
|
||||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,76 @@
|
|||||||
|
require 'rails_helper'
|
||||||
|
require 'fileutils'
|
||||||
|
require 'csv'
|
||||||
|
|
||||||
|
# Spec for Dese::OneAScraper. Only the first example runs the scraper; the
# remaining examples read the CSV file that run leaves at `filepath`, so they
# depend on the first example having been executed.
RSpec.describe Dese::OneAScraper do
  let(:academic_years) do
    [
      create(:academic_year, range: '2021-22'),
      create(:academic_year, range: '2020-21')
      # create(:academic_year, range: '2019-20'),
      # create(:academic_year, range: '2018-19'),
      # create(:academic_year, range: '2017-18'),
      # create(:academic_year, range: '2016-17')
    ]
  end
  let(:filepath) { Rails.root.join('tmp', 'spec', 'dese', 'one_a.csv') }

  before do
    # Create the directory the output file lives in, then force the lazy
    # `let` so the academic years exist before the scraper queries them.
    FileUtils.mkdir_p filepath.dirname
    academic_years
  end

  # `xcontext` marks the whole group as pending, so none of these run.
  # NOTE(review): presumably disabled because the scraper drives a real
  # browser against the live site - confirm before re-enabling.
  xcontext 'Creating a new Scraper' do
    it 'creates a csv file with the scraped data' do
      Dese::OneAScraper.new(filepath:)
      expect(filepath).to exist
    end

    it 'has the correct headers' do
      # Parse the first line as CSV instead of splitting the raw text, so the
      # expectation does not have to carry the trailing newline.
      headers = CSV.foreach(filepath).first

      expect(headers).to eq ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                             'School Name', 'DESE ID', 'Total # of Teachers(FTE)', 'Percent of Teachers Licensed',
                             'Student/Teacher Ratio', 'Percent of Experienced Teachers',
                             'Percent of Teachers without Waiver or Provisional License', 'Percent Teaching in-field']
    end

    it 'has the right likert score results for a-exp-i1' do
      # Filter on the item id rather than relying on a-exp-i1 rows being
      # the first twenty rows of the file.
      results = CSV.foreach(filepath, headers: true).filter_map do |row|
        row['Likert Score'].to_f if row['Admin Data Item'] == 'a-exp-i1'
      end

      expect(results.take(20)).to eq [3.7, 4, 4.31, 3.95, 3.99, 3.69, 1.69,
                                      4.5, 4.21, 4.1, 5, 4.2, 4.51, 3.97, 4.35,
                                      4.38, 4.08, 4, 4.12, 5]
    end

    it 'has the right likert score results for a-exp-i3' do
      results = CSV.foreach(filepath, headers: true).filter_map do |row|
        row['Likert Score'].to_f if row['Admin Data Item'] == 'a-exp-i3' && row['Academic Year'] == '2020-21'
      end

      expect(results.take(20)).to eq [3.68, 4.21, 4.01, 3.73, 4.21, 4.21, 2.48, 4.06, 4.21, 4.21, 4.21, 4.21, 4.21,
                                      4.21, 4.21, 4.1, 4, 4.07, 3.32, 4.21]
    end

    it 'has the latest academic_year' do
      # The first data row comes from the first academic year scraped.
      first_row = CSV.foreach(filepath, headers: true).first

      expect(first_row['Academic Year']).to eq '2021-22'
    end

    it 'has both admin data items in the file' do
      results = CSV.foreach(filepath, headers: true).map { |row| row['Admin Data Item'] }

      expect(results.uniq).to eq %w[a-exp-i1 a-exp-i3]
    end
  end
end
|
||||||
Loading…
Reference in new issue