sqm-dashboards/app/services/cleaner.rb

require "fileutils"
class Cleaner
  attr_reader :input_filepath, :output_filepath, :log_filepath

  def initialize(input_filepath:, output_filepath:, log_filepath:)
    @input_filepath = input_filepath
    @output_filepath = output_filepath
    @log_filepath = log_filepath
    initialize_directories
  end

  def clean
    Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
      puts filepath
      File.open(filepath) do |file|
        processed_data = process_raw_file(file:)
        processed_data in [headers, clean_csv, log_csv, data]
        return if data.empty?

        filename = filename(headers:, data:, filepath:)
        write_csv(data: clean_csv, output_filepath:, filename:)
        write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
      end
    end
  end

  def filename(headers:, data:, filepath:)
    output = []
    survey_item_ids = headers.filter(&:present?).filter do |header|
                        header.start_with?("s-", "t-", "p-")
                      end.reject { |item| item.end_with? "-1" }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    range = data.first.academic_year.range

    districts = data.map do |row|
      row.district.short_name
    end.to_set.to_a

    schools = data.map do |row|
      row.school.name
    end.to_set

    part = filepath&.match(/[\b\s_.]+(part)[\W*_](?<label>[\w\d])/i)&.named_captures&.[]("label")&.upcase

    school_name = schools.first.parameterize

    output << districts.sort.join(".")
    output << school_name if schools.length == 1
    output << survey_type.to_s
    output << "Part-" + part unless part.nil?
    output << range.parameterize
    output << "csv"
    output.join(".")
  end

  def process_raw_file(file:)
    clean_csv = []
    log_csv = []
    data = []
    headers = CSV.parse(file.first).first

    # If this is a student survey
    # Make sure it includes a 'Grade' header
    is_student_survey = headers.filter(&:present?)
                               .filter { |header| header.start_with? "s-" }
                               .count > 0

    has_grade_header = headers.filter(&:present?).find { |header| header.match?(/grade/i) }.present?
    if is_student_survey && has_grade_header == false
      puts "could not find the Grade header.  Stopping execution"
      exit
    end

    duplicate_header = headers.detect { |header| headers.count(header) > 1 }
    unless duplicate_header.nil?
      puts "\n>>>>>>>>>>>>>>>>>>    Duplicate header found.  This will misalign column headings.  Please delete or rename the duplicate column: #{duplicate_header} \n>>>>>>>>>>>>>> \n"
      exit
    end

    headers = headers.to_set
    headers = headers.merge(Set.new(["Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count",
                                     "Race", "Gender", "Raw Housing Status", "Housing Status", "Home Language", "Home Languages"])).to_a
    filtered_headers = include_all_headers(headers:)
    filtered_headers = remove_unwanted_headers(headers: filtered_headers)
    log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
                                       "Valid Standard Deviation?"]).flatten
    clean_csv << filtered_headers
    log_csv << log_headers

    all_survey_items = survey_items(headers:)

    file.lazy.each_slice(1000) do |lines|
      CSV.parse(lines.join, headers:).map do |row|
        values = SurveyItemValues.new(row:, headers:,
                                      survey_items: all_survey_items, schools:, academic_years:)
        unless values.valid_school?
          # puts "row #{values.response_id}: dese id :#{values.dese_id} : could not find school, skipping row for file #{file.path}"
          next
        end

        data << values
        values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
      end
    end
    [headers, clean_csv, log_csv, data]
  end

  private

  def academic_years
    @academic_years ||= AcademicYear.all
  end

  def include_all_headers(headers:)
    alternates = headers.filter(&:present?)
                        .filter { |header| header.match?(/^[st]-\w*-\w*-1$/i) }
    alternates.each do |header|
      main = header.sub(/-1\z/, "")
      headers.push(main) unless headers.include?(main)
    end
    headers
  end

  def initialize_directories
    create_ouput_directory
    create_log_directory
  end

  def remove_unwanted_headers(headers:)
    headers.to_set.to_a.compact.reject do |item|
      item.start_with? "Q"
    end.reject { |header| header.match?(/^[stp]-\w*-\w*-1$/i) }
  end

  def write_csv(data:, output_filepath:, filename:, prefix: "")
    csv = CSV.generate do |csv|
      data.each do |row|
        csv << row
      end
    end
    File.write(output_filepath.join(prefix + filename), csv)
  end

  def schools
    @schools ||= School.by_dese_id
  end

  def genders
    @genders ||= Gender.by_qualtrics_code
  end

  def survey_items(headers:)
    survey_item_ids = headers
                      .filter(&:present?)
                      .filter { |header| header.start_with? "t-", "s-", "p-" }
    @survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
  end

  def create_ouput_directory
    FileUtils.mkdir_p output_filepath
  end

  def create_log_directory
    FileUtils.mkdir_p log_filepath
  end
end