sqm-dashboards/app/services/dese/three_b_one.rb

require "watir"

module Dese
  class ThreeBOne
    include Dese::Scraper
    attr_reader :filepaths

    def initialize(filepaths: [Rails.root.join("data", "admin_data", "dese", "3B_1_masscore.csv"),
                               Rails.root.join("data", "admin_data", "dese", "3B_1_advcoursecomprate.csv"),
                               Rails.root.join("data", "admin_data", "dese", "3B_1_ap.csv"),
                               Rails.root.join("data", "admin_data", "dese", "3B_1_adv_courses.csv"),
                               Rails.root.join("data", "admin_data", "dese", "3B_1_course_ratio.csv"),
                               Rails.root.join("data" , "admin_data", "dese", "3B_1_enrollments_by_race.csv") ,
                               Rails.root.join("data" , "admin_data", "dese", "3B_1_enrollments_by_grade.csv") ,
                               Rails.root.join("data" , "admin_data", "dese", "3B_1_adv_courses_white_students.csv") ])
      @filepaths = filepaths
    end

    def run_all
      filepath = filepaths[0]
      headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
                 "# Graduated", "# Completed MassCore", "% Completed MassCore"]
      write_headers(filepath:, headers:)

      run_a_curv_i1(filepath:)

      filepath = filepaths[1]
      headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
                 "# Grade 11 and 12 Students", "# Students Completing Advanced", "% Students Completing Advanced",
                 "% ELA", "% Math", "% Science and Technology", "% Computer and Information Science",
                 "% History and Social Sciences", "% Arts", "% All Other Subjects", "% All Other Subjects"]
      write_headers(filepath:, headers:)
      run_a_curv_i2(filepath:)

      filepath = filepaths[2]
      headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
                 "Tests Taken", "Score=1", "Score=2", "Score=3", "Score=4", "Score=5", "% Score 1-2", "% Score 3-5"]
      write_headers(filepath:, headers:)
      run_a_curv_i3(filepath:)

      filepath = filepaths[3]
      headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
                 '# Grade 11 and 12 Students', '# Students Completing Advanced', '% Students Completing Advanced', '% ELA', '% Math', '% Science and Technology', '% Computer and Information Science', '% History and Social Sciences', '% Arts', '% All Other Subjects', 'Ch 74 Secondary Cooperative Program']
      write_headers(filepath:, headers:)
      run_a_curv_i4(filepath:)

      filepath = filepaths[4]
      headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year", "School Name", "DESE ID",
                 "Total # of Classes", "Average Class Size", "Number of Students", "Female %", "Male %", "English Language Learner %", "Students with Disabilities %", "Low Income %"]
      write_headers(filepath:, headers:)
      run_a_curv_i5(filepath:)

      browser.close
    end

    # We don't need to check to see if this is a high school because the link only lists relevant schools
    def run_a_curv_i1(filepath:)
      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/masscore.aspx"
        range = academic_year.range
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range }
        submit_id = "btnViewReport"
        calculation = lambda { |headers, items|
          completed_index = headers["% Completed MassCore"]
          percent_completed = items[completed_index].to_f
          benchmark = 90
          percent_completed * 4 / benchmark if completed_index.present? && !items[completed_index] != ""
        }
        admin_data_item_id = "a-curv-i1"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    # We don't need to check to see if this is a high school because the link only lists relevant schools
    def run_a_curv_i2(filepath:)
      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/advcoursecomprate.aspx"
        range = "#{academic_year.range.split('-')[1].to_i + 2000}"
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range }
        submit_id = "btnViewReport"
        calculation = lambda { |headers, items|
          completed_index = headers["% Students Completing Advanced"]
          percent_completed = items[completed_index].to_f
          benchmark = 30
          percent_completed * 4 / benchmark if completed_index.present? && !items[completed_index] != ""
        }
        admin_data_item_id = "a-curv-i2"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    # We don't need to check to see if this is a high school because the link only lists relevant schools
    def run_a_curv_i3(filepath:)
      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/ap.aspx"
        range = academic_year.range
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range }
        submit_id = "ctl00_ContentPlaceHolder1_btnViewReport"
        calculation = lambda { |headers, items|
          completed_index = headers["% Score 3-5"]
          percent_score = items[completed_index].to_f
          benchmark = 20
          percent_score * 4 / benchmark if completed_index.present? && !items[completed_index] != ""
        }
        admin_data_item_id = "a-curv-i3"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    def scrape_enrollments_by_race(filepath:)
      headers = ["Raw likert calculation", "Likert Score", "Admin Data Item", "Academic Year",  "School Name", "DESE ID",
                 "American Indian or Alaska Native", "Asian", "Black or African American", "Hispanic or Latino", "Multi-Race, Not Hispanic or Latino",
                 "Native Hawaiian or Other Pacific Islander", "White", "Female",
                 "Male", "Nonbinary"]
      write_headers(filepath:, headers:)
      run do |academic_year|
        admin_data_item_id = ''
        url = 'https://profiles.doe.mass.edu/statereport/enrollmentbyracegender.aspx'
        range = "#{academic_year.range.split('-')[1].to_i + 2000}"
        selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                      'ctl00_ContentPlaceHolder1_ddYear' => range }
        submit_id = 'btnViewReport'
        calculation = ->(_headers, _items) { 'NA' }
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    def non_white_student_percentage
      @non_white_student_percentage ||= {}
      if @non_white_student_percentage.count == 0
        CSV.parse(File.read(filepaths[5]), headers: true).map do |row|
          academic_year = row['Academic Year']
          school_id = row['DESE ID'].to_i
          white = row['White'].gsub(',', '').to_i
          @non_white_student_percentage[[school_id, academic_year]] = 100 - white
        end
      end
      @non_white_student_percentage
    end

    def scrape_enrollments_by_grade(filepath:)
      headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
                 'PK', 'K', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'SP', 'Total']
      write_headers(filepath:, headers:)
      run do |academic_year|
        admin_data_item_id = ''
        url = 'https://profiles.doe.mass.edu/statereport/enrollmentbygrade.aspx'
        range = academic_year.range
        selectors = { 'ctl00_ContentPlaceHolder1_ddReportType' => 'School',
                      'ctl00_ContentPlaceHolder1_ddYear' => range }
        submit_id = 'btnViewReport'
        calculation = ->(_headers, _items) { 'NA' }
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    def eleventh_and_twelfth_grade_student_count
      @eleventh_and_twelfth_grade_student_count ||= {}
      if @eleventh_and_twelfth_grade_student_count.count == 0
        CSV.parse(File.read(filepaths[6]), headers: true).map do |row|
          academic_year = row['Academic Year']
          school_id = row['DESE ID'].to_i
          eleventh = row['11'].gsub(',', '').to_i
          twelfth = row['12'].gsub(',', '').to_i
          @eleventh_and_twelfth_grade_student_count[[school_id, academic_year]] = eleventh + twelfth
        end
      end
      @eleventh_and_twelfth_grade_student_count
    end

    def scrape_advanced_courses_for_white_students(filepath:)
      headers = ['Raw likert calculation', 'Likert Score', 'Admin Data Item', 'Academic Year', 'School Name', 'DESE ID',
                 '# Grade 11 and 12 Students', '# Students Completing Advanced', '% Students Completing Advanced', '% ELA', '% Math', '% Science and Technology', '% Computer and Information Science', '% History and Social Sciences', '% Arts', '% All Other Subjects', 'Ch 74 Secondary Cooperative Program']
      write_headers(filepath:, headers:)

      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/advcoursecomprate.aspx"
        range = "#{academic_year.range.split('-')[1].to_i + 2000}"
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range,
                      "ctl00_ContentPlaceHolder1_ddSubgroup" => "White"}
        submit_id = "btnViewReport"
        calculation = lambda { |headers, items| 'NA' }
        admin_data_item_id = "a-curv-i4"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    def white_students_in_advanced_courses
      @white_students_in_advanced_courses ||= {}
      if @white_students_in_advanced_courses  .count == 0
        CSV.parse(File.read(filepaths[7]), headers: true).map do |row|
          academic_year = row['Academic Year']
          school_id = row['DESE ID'].to_i
          total_num_students_in_adv_courses = row["# Grade 11 and 12 Students"].to_f
          num_completing_adv_courses = row["# Students Completing Advanced"].to_f

          @white_students_in_advanced_courses[[school_id, academic_year]] = total_num_students_in_adv_courses
        end
      end
      @white_students_in_advanced_courses
    end

    # We don't need to check to see if this is a high school because the link only lists relevant schools
    def run_a_curv_i4(filepath:)
      scrape_enrollments_by_race(filepath: filepaths[5])
      scrape_enrollments_by_grade(filepath: filepaths[6])
      scrape_advanced_courses_for_white_students(filepath: filepaths[7])

      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/advcoursecomprate.aspx"
        range = "#{academic_year.range.split('-')[1].to_i + 2000}"
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range }
        submit_id = "btnViewReport"
        calculation = lambda { |headers, items|
          school_id_index = headers["School Code"]
          school_id = items[school_id_index].to_i
          school_name_index = headers["School Name"]
          school_name = items[school_name_index]
          year = academic_year.range

          total_num_students_in_adv_index = headers["# Grade 11 and 12 Students"]
          total_num_students_in_adv_courses = items[total_num_students_in_adv_index].to_f
          return "NA" unless white_students_in_advanced_courses[[school_id, year]]
          non_white_students_in_adv_courses = total_num_students_in_adv_courses - white_students_in_advanced_courses[[school_id, year]]
          return "NA" unless eleventh_and_twelfth_grade_student_count[[school_id, year]]
          count_of_students_in_eleventh_and_twelfth_grade = eleventh_and_twelfth_grade_student_count[[school_id, year]]
          return "NA" unless non_white_student_percentage[[school_id, year]]
          percentage_non_white = non_white_student_percentage[[school_id, year]]
          enrollment_number_of_non_whites = percentage_non_white * count_of_students_in_eleventh_and_twelfth_grade
          percent_non_white_taking_adv_courses = non_white_students_in_adv_courses / count_of_students_in_eleventh_and_twelfth_grade * 100
          benchmark = 35
          percent_non_white_taking_adv_courses * 4 / benchmark
        }
        admin_data_item_id = "a-curv-i4"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end

    def run_a_curv_i5(filepath:)
      run do |academic_year|
        url = "https://profiles.doe.mass.edu/statereport/classsizebygenderpopulation.aspx"
        range = academic_year.range
        selectors = { "ctl00_ContentPlaceHolder1_ddReportType" => "School",
                      "ctl00_ContentPlaceHolder1_ddYear" => range }
        submit_id = "btnViewReport"
        calculation = lambda { |headers, items|
          school_id = items[headers["School Code"]].to_i
          school_name = items[headers["School Name"]]

          return "NA" unless is_hs?(school_id:, school_name:)

          classes_index = headers["Total # of Classes"]
          num_classes = items[classes_index].gsub(",", "").to_f
          students_index = headers["Number of Students"]
          num_students = items[students_index].gsub(",", "").to_f
          benchmark = 2.04

          ((benchmark - (num_students / num_classes)) + benchmark) * 4 / benchmark
        }
        admin_data_item_id = "a-curv-i5"
        Prerequisites.new(filepath, url, selectors, submit_id, admin_data_item_id, calculation)
      end
    end
  end
end