Remove 'scraper' from file/class names; add 3B-1

This commit is contained in:
rebuilt 2022-09-12 11:24:57 -07:00
parent 83ef9310a4
commit 3e28be2d0d
28 changed files with 35252 additions and 27790 deletions

View file

@ -2,7 +2,7 @@ require 'watir'
require 'csv'
module Dese
class FourDScraper
class FourDOne
def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', 'four_d.csv'))
url = 'https://profiles.doe.mass.edu/statereport/plansofhsgrads.aspx'
browser = Watir::Browser.new
@ -24,7 +24,7 @@ module Dese
browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(/School/)
browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
browser.button(id: 'btnViewReport').click
sleep 2 # Sleep to prevent hitting mass.edu with too many requests
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
document = Nokogiri::HTML(browser.html)
document.css('tr')
end

View file

@ -1,7 +1,7 @@
require 'csv'
module Dese
class FourDLoader
class Loader
def self.load_data(filepath:)
CSV.parse(File.read(filepath), headers: true) do |row|
score = likert_score(row:)

View file

@ -2,8 +2,8 @@ require 'watir'
require 'csv'
module Dese
class OneAScraper
def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', 'one_a.csv'))
class OneAOne
def initialize(filepath: Rails.root.join('data', 'admin_data', 'dese', 'one_a_one_teacher_data.csv'))
url = 'https://profiles.doe.mass.edu/statereport/teacherdata.aspx'
browser = Watir::Browser.new
write_headers(filepath:)
@ -25,7 +25,7 @@ module Dese
browser.select(id: 'ctl00_ContentPlaceHolder1_ddReportType').select(text: 'School')
browser.select(id: 'ctl00_ContentPlaceHolder1_ddYear').select(text: range)
browser.button(id: 'ctl00_ContentPlaceHolder1_btnViewReport').click
sleep 3 # Sleep to prevent hitting mass.edu with too many requests
sleep Dese::Scraper::DELAY # Sleep to prevent hitting mass.edu with too many requests
Nokogiri::HTML(browser.html)
end

View file

@ -2,16 +2,16 @@ require 'watir'
require 'csv'
module Dese
class OneAThreeScraper
class OneAThree
include Dese::Scraper
attr_reader :filepaths
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation)
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'one_a_three.csv'),
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'one_a_three_staffing_retention.csv'),
Rails.root.join('data', 'admin_data', 'dese', 'one_a_three_teachers_of_color.csv')])
@filepaths = filepaths
end
def run_all
run_a_pcom_i1
run_a_pcom_i3
@ -64,76 +64,5 @@ module Dese
Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
end
end
# Runs one scrape per academic year.
#
# Each record from AcademicYear.all is yielded to the caller's block, which must return an
# object responding to url, selectors, submit_id, filepath, admin_data_item_id and calculation.
# The page is fetched with get_html; a nil document skips that year, otherwise the document
# is passed on to write_csv.
def run
  AcademicYear.all.each do |year|
    spec = yield year
    page = get_html(url: spec.url, selectors: spec.selectors, submit_id: spec.submit_id)
    next if page.nil?

    write_csv(document: page,
              filepath: spec.filepath,
              range: year.range,
              id: spec.admin_data_item_id,
              calculation: spec.calculation)
  end
end
# Returns the Watir browser for this object, creating it on first use and reusing it afterwards.
def browser
  @browser = Watir::Browser.new unless @browser
  @browser
end
# Loads +url+, applies each dropdown selection, submits the form and returns the parsed page.
#
# url       - address passed to browser.goto.
# selectors - Hash of select-element id => option text to choose.
# submit_id - id of the button that submits the report form.
#
# Returns the Nokogiri document built from the browser's HTML, or nil as soon as one of the
# requested option texts is not present on the page (nothing is submitted in that case).
def get_html(url:, selectors:, submit_id:)
  browser.goto(url)
  selectors.each do |select_id, option_text|
    return nil unless browser.option(text: option_text).present?

    browser.select(id: select_id).select(text: option_text)
  end
  browser.button(id: submit_id).click
  # Throttle with the shared scraper delay (as the other DESE scrapers do) instead of a
  # hard-coded pause, so the wait can be tuned in one place.
  sleep Dese::Scraper::DELAY
  Nokogiri::HTML(browser.html)
end
# Opens the CSV at +filepath+ in write mode (replacing any existing content) and writes
# +headers+ as its single row.
def write_headers(filepath:, headers:)
  CSV.open(filepath, 'w') { |sheet| sheet << headers }
end
# Appends one CSV row per table row of +document+ to +filepath+.
#
# document    - parsed page; its 'tr' elements are the data rows and its '.sorting' elements
#               are the column headers.
# filepath    - CSV file opened in append mode.
# range       - value written to the fourth column of every row.
# id          - value written to the third column of every row.
# calculation - callable receiving (header text => column position Hash, Array of cell texts);
#               a nil/false result is recorded as 'NA'.
#
# Rows whose second cell does not convert to a non-zero integer are skipped. Each written row
# is: raw score, score limited to 1..5 and rounded to 2 places (or 'NA'), id, range, then the
# cell texts (including anything the calculation appended to them).
def write_csv(document:, filepath:, range:, id:, calculation:)
  rows = document.css('tr')
  column_positions = document.css('.sorting').each_with_index.to_h { |cell, position| [cell.text, position] }

  CSV.open(filepath, 'a') do |csv|
    rows.each do |tr|
      cells = tr.css('td').map(&:text)
      # to_i never yields nil, so a zero check alone covers missing and non-numeric ids.
      next if cells[1].to_i.zero?

      raw_score = calculation.call(column_positions, cells) || 'NA'
      bounded_score =
        if raw_score == 'NA'
          raw_score
        elsif raw_score > 5
          5
        elsif raw_score < 1
          1
        else
          raw_score.round(2)
        end

      csv << [raw_score, bounded_score, id, range, cells].flatten
    end
  end
end
end
end

View file

@ -1,6 +1,9 @@
module Dese
module Scraper
DELAY = 3
DELAY = 20
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation)
def reverse_score(likert_score:)
return nil unless likert_score.present?

View file

@ -6,12 +6,11 @@ module Dese
include Dese::Scraper
attr_reader :filepaths
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation)
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'two_c_one_attendance.csv')])
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'three_a_one_gender_population.csv')])
@filepaths = filepaths
end
def run_all
filepath = filepaths[0]
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
'Total # of Classes', 'Average Class Size', 'Number of Students', 'Female %', 'Male %',

View file

@ -0,0 +1,187 @@
require 'watir'
require 'csv'
module Dese
class ThreeATwo
include Dese::Scraper
attr_reader :filepaths
# filepaths - output CSV locations, in the order run_all uses them: [0] enrollment,
#             [1] age staffing, [2] grade/subject staffing. Defaults to files under
#             data/admin_data/dese in the Rails root.
def initialize(
  filepaths: %w[enrollment.csv three_a_two_age_staffing.csv three_a_two_grade_subject_staffing.csv].map do |csv_name|
    Rails.root.join('data', 'admin_data', 'dese', csv_name)
  end
)
  @filepaths = filepaths
end
# Runs every scrape of this class in sequence, then closes the browser.
#
# Enrollment goes to filepaths[0]; the a-sust-i1..i3 scrapes share filepaths[1] (headers are
# written once, before the first of them); a-sust-i4 writes to filepaths[2].
def run_all
  scrape_enrollments(filepath: filepaths[0])

  age_staffing_csv = filepaths[1]
  write_a_sust_i1_headers(filepath: age_staffing_csv)
  run_a_sust_i1(filepath: age_staffing_csv)
  run_a_sust_i2(filepath: age_staffing_csv)
  run_a_sust_i3(filepath: age_staffing_csv)

  grade_subject_csv = filepaths[2]
  write_a_sust_i4_headers(filepath: grade_subject_csv)
  run_a_sust_i4(filepath: grade_subject_csv)

  browser.close
end
# Writes the header row for the age-staffing CSV at +filepath+ via write_headers.
def write_a_sust_i1_headers(filepath:)
  leading_columns = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                     'School Name', 'DESE ID']
  age_band_columns = ['<26 yrs (# )', '26-32 yrs (#)', '33-40 yrs (#)', '41-48 yrs (#)',
                      '49-56 yrs (#)', '57-64 yrs (#)', 'Over 64 yrs (#)']
  trailing_columns = ['FTE Count', 'Student Count', 'Student to Guidance Counselor ratio']

  write_headers(filepath:, headers: leading_columns + age_band_columns + trailing_columns)
end
# Writes the header row for the grade/subject staffing CSV at +filepath+ via write_headers.
def write_a_sust_i4_headers(filepath:)
  leading_columns = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                     'School Name', 'DESE ID']
  grade_band_columns = ['PK-2 (# )', '3-5 (# )', '6-8 (# )', '9-12 (# )', 'Multiple Grades (# )',
                        'All Grades (# )']
  trailing_columns = ['FTE Count', 'Student Count', 'Student to Art Teacher ratio']

  write_headers(filepath:, headers: leading_columns + grade_band_columns + trailing_columns)
end
# Writes the enrollment header row to +filepath+, then scrapes the enrollment-by-grade report
# for each year supplied by run. No score is computed for these rows: the calculation always
# returns 'NA' and the admin data item id is blank.
def scrape_enrollments(filepath:)
  grade_columns = ['PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'SP', 'Total']
  write_headers(filepath:,
                headers: ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                          'School Name', 'DESE ID'] + grade_columns)

  not_scored = ->(_headers, _items) { 'NA' }
  run do |academic_year|
    dropdowns = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => academic_year.range }
    Prerequisites.new(filepath,
                      'https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx',
                      dropdowns,
                      'btnViewReport',
                      '',
                      not_scored)
  end
end
# Looks up the total enrollment for a school and academic year.
#
# filepath - enrollment CSV with 'Academic Year', 'DESE ID' and 'Total' columns; it is read
#            only on the first call and the totals are cached in @students afterwards.
# dese_id  - Integer school id (the CSV's 'DESE ID' is converted with to_i before matching).
# year     - academic year text exactly as it appears in the 'Academic Year' column.
#
# Returns the Integer total, or nil when no row matches.
def student_count(filepath:, dese_id:, year:)
  # Memoize the parsed table itself so a file with no data rows is not re-read on every call.
  @students ||= CSV.foreach(filepath, headers: true).each_with_object({}) do |row, totals|
    # Totals may carry thousands separators ("1,234"); a blank cell counts as 0 rather than
    # raising on nil.
    totals[[row['DESE ID'].to_i, row['Academic Year']]] = row['Total'].to_s.delete(',').to_i
  end
  @students[[dese_id, year]]
end
# Scrapes the age-staffing report filtered to 'Guidance Counselor' (item a-sust-i1) for each
# year supplied by run, writing rows to +filepath+.
#
# The calculation appends two values to the row's cells - the school's student count and the
# students-per-counselor ratio - and returns a score that equals 4 when the ratio matches the
# benchmark of 250 and falls as the ratio rises. It returns nil when the 'FTE Count' column is
# missing or its cell is blank.
def run_a_sust_i1(filepath:)
  run do |academic_year|
    admin_data_item_id = 'a-sust-i1'
    url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx'
    range = academic_year.range
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range,
                  'ctl00_ContentPlaceHolder1_ddJobClassification' => 'Guidance Counselor' }
    submit_id = 'btnViewReport'
    calculation = lambda { |headers, items|
      fte_index = headers['FTE Count']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      fte_cell = fte_index.nil? ? nil : items[fte_index]
      # The previous guard (`!cell != ''`) was always true, so blank cells were scored.
      fte_reported = !fte_cell.to_s.strip.empty?
      num_of_guidance_counselors = fte_cell.to_f
      dese_id = items[headers['School Code']].to_i
      num_of_students = student_count(filepath: filepaths[0], dese_id:, year: range) || 0
      benchmark = 250
      students_per_counselor = num_of_students / num_of_guidance_counselors
      # Both extra columns are appended even when no score is produced so rows stay aligned.
      items << num_of_students
      items << students_per_counselor
      ((benchmark - students_per_counselor) + benchmark) * 4 / benchmark if fte_reported
    }
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
# Scrapes the age-staffing report filtered to 'School Psychologist -- Non-Special Education'
# (item a-sust-i2) for each year supplied by run, writing rows to +filepath+.
#
# The calculation appends two values to the row's cells - the school's student count and the
# students-per-psychologist ratio - and returns a score that equals 4 when the ratio matches
# the benchmark of 250 and falls as the ratio rises. It returns nil when the 'FTE Count'
# column is missing or its cell is blank.
def run_a_sust_i2(filepath:)
  run do |academic_year|
    admin_data_item_id = 'a-sust-i2'
    url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx'
    range = academic_year.range
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range,
                  'ctl00_ContentPlaceHolder1_ddJobClassification' => 'School Psychologist -- Non-Special Education' }
    submit_id = 'btnViewReport'
    calculation = lambda { |headers, items|
      fte_index = headers['FTE Count']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      fte_cell = fte_index.nil? ? nil : items[fte_index]
      # The previous guard (`!cell != ''`) was always true, so blank cells were scored.
      fte_reported = !fte_cell.to_s.strip.empty?
      num_of_psychologists = fte_cell.to_f
      dese_id = items[headers['School Code']].to_i
      num_of_students = student_count(filepath: filepaths[0], dese_id:, year: range) || 0
      benchmark = 250
      students_per_psychologist = num_of_students / num_of_psychologists
      # Both extra columns are appended even when no score is produced so rows stay aligned.
      items << num_of_students
      items << students_per_psychologist
      ((benchmark - students_per_psychologist) + benchmark) * 4 / benchmark if fte_reported
    }
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
# Scrapes the age-staffing report filtered to 'Paraprofessional' (item a-sust-i3) for each
# year supplied by run, writing rows to +filepath+.
#
# The calculation appends two values to the row's cells - the school's student count and the
# students-per-paraprofessional ratio - and returns a score that equals 4 when the ratio
# matches the benchmark of 43.4 and falls as the ratio rises. It returns nil when the
# 'FTE Count' column is missing or its cell is blank.
def run_a_sust_i3(filepath:)
  run do |academic_year|
    admin_data_item_id = 'a-sust-i3'
    url = 'https://profiles.doe.mass.edu/statereport/agestaffing.aspx'
    range = academic_year.range
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range,
                  'ctl00_ContentPlaceHolder1_ddJobClassification' => 'Paraprofessional' }
    submit_id = 'btnViewReport'
    calculation = lambda { |headers, items|
      fte_index = headers['FTE Count']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      fte_cell = fte_index.nil? ? nil : items[fte_index]
      # The previous guard (`!cell != ''`) was always true, so blank cells were scored.
      fte_reported = !fte_cell.to_s.strip.empty?
      num_of_paraprofessionals = fte_cell.to_f
      dese_id = items[headers['School Code']].to_i
      num_of_students = student_count(filepath: filepaths[0], dese_id:, year: range) || 0
      benchmark = 43.4
      students_per_paraprofessional = num_of_students / num_of_paraprofessionals
      # Both extra columns are appended even when no score is produced so rows stay aligned.
      items << num_of_students
      items << students_per_paraprofessional
      ((benchmark - students_per_paraprofessional) + benchmark) * 4 / benchmark if fte_reported
    }
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
# Scrapes the grade/subject staffing report filtered to 'Arts' (item a-sust-i4) for each year
# supplied by run, writing rows to +filepath+.
#
# The calculation reads the teacher FTE from the last cell of the row and the school id from
# the second cell, appends the school's student count and the students-per-art-teacher ratio,
# and returns a score that equals 4 when the ratio matches the benchmark of 500 and falls as
# the ratio rises. It returns nil when the FTE cell is missing or blank.
def run_a_sust_i4(filepath:)
  run do |academic_year|
    admin_data_item_id = 'a-sust-i4'
    url = 'https://profiles.doe.mass.edu/state_report/gradesubjectstaffing.aspx'
    range = academic_year.range
    selectors = {
      'ctl00_ContentPlaceHolder1_reportType' => 'School',
      'ctl00_ContentPlaceHolder1_fyCode' => range,
      'ctl00_ContentPlaceHolder1_subjectCode' => 'Arts'
    }
    submit_id = 'ctl00_ContentPlaceHolder1_Continue'
    calculation = lambda { |_headers, items|
      # Capture the FTE cell before anything is appended to the row.
      fte_cell = items.last
      # Test the raw cell: the converted Float is never blank, so checking it (as before)
      # scored rows that had no FTE value.
      fte_reported = !fte_cell.to_s.strip.empty?
      num_of_art_teachers = fte_cell.to_f
      dese_id = items[1].to_i
      num_of_students = student_count(filepath: filepaths[0], dese_id:, year: range) || 0
      benchmark = 500
      students_per_art_teacher = num_of_students / num_of_art_teachers
      # Both extra columns are appended even when no score is produced so rows stay aligned.
      items << num_of_students
      items << students_per_art_teacher
      ((benchmark - students_per_art_teacher) + benchmark) * 4 / benchmark if fte_reported
    }
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
end
end

View file

@ -0,0 +1,94 @@
require 'watir'
require 'csv'
module Dese
class ThreeBOne
include Dese::Scraper
attr_reader :filepaths
# filepaths - output CSV locations, in the order run_all uses them: [0] MassCore,
#             [1] advanced course completion, [2] AP. Defaults to files under
#             data/admin_data/dese in the Rails root.
def initialize(
  filepaths: %w[three_b_one_masscore.csv three_b_one_advcoursecomprate.csv three_b_one_ap.csv].map do |csv_name|
    Rails.root.join('data', 'admin_data', 'dese', csv_name)
  end
)
  @filepaths = filepaths
end
# Writes the header row for, then scrapes, each of the three reports in turn
# (a-curv-i1 -> filepaths[0], a-curv-i2 -> filepaths[1], a-curv-i3 -> filepaths[2]),
# and finally closes the browser.
def run_all
  common_columns = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year',
                    'School Name', 'DESE ID']

  masscore_csv = filepaths[0]
  write_headers(filepath: masscore_csv,
                headers: common_columns + ['# Graduated', '# Completed MassCore', '% Completed MassCore'])
  run_a_curv_i1(filepath: masscore_csv)

  advanced_course_csv = filepaths[1]
  # NOTE(review): '% All Other Subjects' is listed twice; kept as-is - confirm against the
  # report's actual columns.
  write_headers(filepath: advanced_course_csv,
                headers: common_columns + ['# Grade 11 and 12 Students', '# Students Completing Advanced',
                                           '% Students Completing Advanced', '% ELA', '% Math',
                                           '% Science and Technology', '% Computer and Information Science',
                                           '% History and Social Sciences', '% Arts', '% All Other Subjects',
                                           '% All Other Subjects'])
  run_a_curv_i2(filepath: advanced_course_csv)

  ap_csv = filepaths[2]
  write_headers(filepath: ap_csv,
                headers: common_columns + ['Tests Taken', 'Score=1', 'Score=2', 'Score=3', 'Score=4', 'Score=5',
                                           '% Score 1-2', '% Score 3-5'])
  run_a_curv_i3(filepath: ap_csv)

  browser.close
end
# Scrapes the MassCore report (item a-curv-i1) for each year supplied by run, writing rows to
# +filepath+.
#
# The calculation scales the '% Completed MassCore' cell so that the benchmark of 90 maps to a
# score of 4. It returns nil when that column is missing or its cell is blank.
def run_a_curv_i1(filepath:)
  run do |academic_year|
    url = 'https://profiles.doe.mass.edu/statereport/masscore.aspx'
    range = academic_year.range
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range }
    submit_id = 'btnViewReport'
    calculation = lambda { |headers, items|
      completed_index = headers['% Completed MassCore']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      completed_cell = completed_index.nil? ? nil : items[completed_index]
      benchmark = 90
      # The previous guard (`!cell != ''`) was always true, so blank cells scored 0 instead of NA.
      completed_cell.to_f * 4 / benchmark unless completed_cell.to_s.strip.empty?
    }
    admin_data_item_id = 'a-curv-i1'
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
# Scrapes the advanced course completion report (item a-curv-i2) for each year supplied by
# run, writing rows to +filepath+.
#
# This report's year dropdown is selected by a four-digit year built from the part of the
# academic-year range after the hyphen plus 2000 (e.g. '2021-22' -> '2022').
#
# The calculation scales the '% Students Completing Advanced' cell so that the benchmark of 30
# maps to a score of 4. It returns nil when that column is missing or its cell is blank.
def run_a_curv_i2(filepath:)
  run do |academic_year|
    url = 'https://profiles.doe.mass.edu/statereport/advcoursecomprate.aspx'
    range = (academic_year.range.split('-')[1].to_i + 2000).to_s
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range }
    submit_id = 'btnViewReport'
    calculation = lambda { |headers, items|
      completed_index = headers['% Students Completing Advanced']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      completed_cell = completed_index.nil? ? nil : items[completed_index]
      benchmark = 30
      # The previous guard (`!cell != ''`) was always true, so blank cells scored 0 instead of NA.
      completed_cell.to_f * 4 / benchmark unless completed_cell.to_s.strip.empty?
    }
    admin_data_item_id = 'a-curv-i2'
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
# Scrapes the AP report (item a-curv-i3) for each year supplied by run, writing rows to
# +filepath+.
#
# The calculation scales the '% Score 3-5' cell so that the benchmark of 20 maps to a score of
# 4. It returns nil when that column is missing or its cell is blank.
def run_a_curv_i3(filepath:)
  run do |academic_year|
    url = 'https://profiles.doe.mass.edu/statereport/ap.aspx'
    range = academic_year.range
    selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                  'ctl00_ContentPlaceHolder1_ddYear' => range }
    submit_id = 'ctl00_ContentPlaceHolder1_btnViewReport'
    calculation = lambda { |headers, items|
      passing_index = headers['% Score 3-5']
      # Only index the row when the column exists: Array#[] raises TypeError for a nil index.
      passing_cell = passing_index.nil? ? nil : items[passing_index]
      benchmark = 20
      # The previous guard (`!cell != ''`) was always true, so blank cells scored 0 instead of NA.
      passing_cell.to_f * 4 / benchmark unless passing_cell.to_s.strip.empty?
    }
    admin_data_item_id = 'a-curv-i3'
    Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
  end
end
end
end

View file

@ -2,17 +2,16 @@ require 'watir'
require 'csv'
module Dese
class TwoAOneScraper
class TwoAOne
include Dese::Scraper
attr_reader :filepaths
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation)
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'two_a_one_students_suspended.csv'),
Rails.root.join('data', 'admin_data', 'dese', 'two_a_one_students_disciplined.csv')])
@filepaths = filepaths
end
def run_all
run_a_phys_i1
run_a_phys_i3

View file

@ -2,26 +2,28 @@ require 'watir'
require 'csv'
module Dese
class TwoCOneScraper
class TwoCOne
include Dese::Scraper
attr_reader :filepaths
Prerequisites = Struct.new('Prerequisites', :filepath, :url, :selectors, :submit_id, :admin_data_item_id,
:calculation)
def initialize(filepaths: [Rails.root.join('data', 'admin_data', 'dese', 'two_c_one_attendance.csv')])
@filepaths = filepaths
end
def run_all
write_a_vale_i1_headers
run_a_vale_i1
run_a_vale_i2
browser.close
end
def write_a_vale_i1_headers
filepath = filepaths[0]
headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
'Attendance Rate', 'Average # of Absences', 'Absent 10 or more days', 'Chronically Absent (10% or more)',
'Chronically Absent (20% or more)', 'Unexcused > 9 days']
write_headers(filepath:, headers:)
run_a_vale_i1
run_a_vale_i2
browser.close
end
def run_a_vale_i1