fix: ensure cleaner outputs columns for all survey items. Before the fix, if a survey item variant (ending in -1, i.e., s-tint-q1-1) did not have a matching survey item s-tint-q1, the resulting CSV would not include that column

rpp-main
rebuilt 2 years ago
parent 2c9df34fac
commit 7bd7923d41

@ -1 +1 @@
3.2.1
3.2.2

@ -1,5 +1,5 @@
source "https://rubygems.org"
ruby "3.2.1"
ruby "3.2.2"
git_source(:github) do |repo_name|
repo_name = "#{repo_name}/#{repo_name}" unless repo_name.include?("/")

@ -506,7 +506,7 @@ DEPENDENCIES
watir
RUBY VERSION
ruby 3.2.1p31
ruby 3.2.2p53
BUNDLED WITH
2.3.3

@ -1,4 +1,4 @@
require 'fileutils'
require "fileutils"
class Cleaner
attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv
@ -15,17 +15,44 @@ class Cleaner
end
def clean
Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
puts filepath
File.open(filepath) do |file|
File.open(filepath) do |_file|
clean_csv = []
log_csv = []
data = []
headers = CSV.parse(file.first).first
filtered_headers = remove_unwanted_headers(headers:)
log_headers = (filtered_headers + ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
'Valid Standard Deviation?']).flatten
filename = filename(headers:, data:)
write_csv(data: clean_csv, output_filepath:, filename:)
write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
end
end
end
def filename(headers:, data:)
survey_item_ids = headers.filter(&:present?).filter do |header|
header.start_with?("s-", "t-")
end.reject { |item| item.end_with? "-1" }
survey_type = SurveyItem.survey_type(survey_item_ids:)
range = data.first.academic_year.range
districts = data.map do |row|
row.district.name
end.to_set.to_a
districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
end
def process_raw_file(file:, disaggregation_data:)
clean_csv = []
log_csv = []
data = []
headers = (CSV.parse(file.first).first << "Raw Income") << "Income"
filtered_headers = include_all_headers(headers:)
filtered_headers = remove_unwanted_headers(headers: filtered_headers)
log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
"Valid Standard Deviation?"]).flatten
clean_csv << filtered_headers
log_csv << log_headers
@ -35,44 +62,40 @@ class Cleaner
file.lazy.each_slice(1000) do |lines|
CSV.parse(lines.join, headers:).map do |row|
values = SurveyItemValues.new(row:, headers:, genders:,
survey_items: all_survey_items, schools:)
survey_items: all_survey_items, schools:, disaggregation_data:)
next unless values.valid_school?
data << values
values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
end
end
unless data.empty?
filename = filename(headers:, data:)
write_csv(data: clean_csv, output_filepath:, filename:)
write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename:)
end
end
end
[headers, clean_csv, log_csv, data]
end
def filename(headers:, data:)
survey_item_ids = headers.filter(&:present?).filter do |header|
header.start_with?('s-', 't-')
end.reject { |item| item.end_with? '-1' }
survey_type = SurveyItem.survey_type(survey_item_ids:)
range = data.first.academic_year.range
private
districts = data.map do |row|
row.district.name
end.to_set.to_a
def include_all_headers(headers:)
alternates = headers.filter(&:present?)
.filter { |header| header.end_with? "-1" }
alternates.each do |header|
main = header.sub(/-1\z/, "")
headers.push(main) unless headers.include?(main)
end
headers
end
districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
def initialize_directories
create_ouput_directory
create_log_directory
end
def remove_unwanted_headers(headers:)
headers.to_set.to_a.compact.reject do |item|
item.start_with? 'Q'
end.reject { |item| item.end_with? '-1' }
item.start_with? "Q"
end.reject { |item| item.end_with? "-1" }
end
def write_csv(data:, output_filepath:, filename:, prefix: '')
def write_csv(data:, output_filepath:, filename:, prefix: "")
csv = CSV.generate do |csv|
data.each do |row|
csv << row
@ -102,13 +125,10 @@ class Cleaner
end
def survey_items(headers:)
@survey_items ||= SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
end
def get_survey_item_ids_from_headers(headers:)
headers
survey_item_ids = headers
.filter(&:present?)
.filter { |header| header.start_with? 't-', 's-' }
.filter { |header| header.start_with? "t-", "s-" }
@survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
end
def create_ouput_directory

@ -7,6 +7,11 @@ class SurveyItemValues
@genders = genders
@survey_items = survey_items
@schools = schools
@disaggregation_data = disaggregation_data
copy_likert_scores_from_variant_survey_items
row["Income"] = income
row["Raw Income"] = raw_income
end
# Some survey items have variants, i.e. a survey item with an id of s-tint-q1 might have a variant that looks like s-tint-q1-1. We must ensure that all variants in the form of s-tint-q1-1 have a matching pair.
@ -115,6 +120,8 @@ class SurveyItemValues
return "Unknown" unless disaggregation_data.present?
byebug
disaggregation = disaggregation_data[[lasid, district.name, academic_year.range]]
return "Unknown" unless disaggregation.present?
@ -147,7 +154,6 @@ class SurveyItemValues
end
def to_a
copy_likert_scores_from_variant_survey_items
headers.select(&:present?)
.reject { |key, _value| key.start_with? "Q" }
.reject { |key, _value| key.end_with? "-1" }
@ -238,7 +244,8 @@ class SurveyItemValues
headers.filter(&:present?).filter { |header| header.end_with? "-1" }.each do |header|
likert_score = row[header]
main_item = header.gsub("-1", "")
row[main_item] = likert_score if likert_score.present?
row[main_item] = likert_score if likert_score.present? && row[main_item].blank?
end
end
end

@ -1,14 +1,6 @@
<g class="grouped-bar-column" data-for-measure-id="<%= column.measure.measure_id %>">
<% score_label_y = [5, 10, 15, 5, 10, 15 ] %>
<% column.bars.each_with_index do |bar, index| %>
<<<<<<< HEAD
<rect data-for-academic-year="<%= bar.academic_year.range %>" x="<%= bar.x_position %>%" y="<%= bar.y_offset %>%" width="<%= column.bar_width %>%" height="<%= bar.bar_height_percentage %>%" fill="<%= bar.color %>" />
<% if ENV["SCORES"].present? && ENV["SCORES"].upcase == "SHOW" %>
<text x="<%= bar.x_position + (column.bar_width * 0.5) %>%" y="<%= score_label_y[index] %>%" text-anchor="middle" dominant-baseline="middle">
<%= bar.average %>
</text>
=======
<% if column.sufficient_data?(index) %>
<rect
<% if column.show_popover? %>
@ -29,7 +21,6 @@
</text>
<% end %>
>>>>>>> 67e469a6 (feat: add popover to analyze graphs that displays the n-size of the different columns. Make sure to only calculate a score for a race if there are more than 10 respondents to a question.)
<% end %>
<% end %>

@ -0,0 +1,36 @@
StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,District,School,LASID,Gender,Race,What grade are you in?,s-emsa-q1,s-emsa-q2,s-emsa-q3,s-tint-q1,s-tint-q2,s-tint-q3,s-tint-q4,s-tint-q5,s-acpr-q1,s-acpr-q2,s-acpr-q3,s-acpr-q4,s-cure-q1,s-cure-q2,s-cure-q3,s-cure-q4,s-sten-q1,s-sten-q2,s-sten-q3,s-sper-q1,s-sper-q2,s-sper-q3,s-sper-q4,s-civp-q1,s-civp-q2,s-civp-q3,s-civp-q4,s-grmi-q1,s-grmi-q2,s-grmi-q3,s-grmi-q4,s-appa-q1,s-appa-q2,s-appa-q3,s-peff-q1,s-peff-q2,s-peff-q3,s-peff-q4,s-peff-q5,s-sbel-q1,s-sbel-q2,s-sbel-q3,s-sbel-q4,s-sbel-q5,s-phys-q1,s-phys-q2,s-phys-q3,s-phys-q4,s-vale-q1,s-vale-q2,s-vale-q3,s-vale-q4,s-acst-q1,s-acst-q2,s-acst-q3,s-sust-q1,s-sust-q2,s-grit-q1,s-grit-q2,s-grit-q3,s-grit-q4,s-expa-q1,s-poaf-q1,s-poaf-q2,s-poaf-q3,s-poaf-q4,s-tint-q1-1,s-tint-q2-1,s-tint-q3-1,s-tint-q4-1,s-tint-q5-1,s-acpr-q1-1,s-acpr-q2-1,s-acpr-q3-1,s-acpr-q4-1,s-peff-q1-1,s-peff-q2-1,s-peff-q3-1,s-peff-q4-1,s-peff-q5-1,s-peff-q6-1
2023-03-17 7:57:47,2023-03-17 8:09:15,0,71.174.81.214,100,1000,1,2023-03-17T8:9:15,1000,2,1740505,1,2,4,9,3,5,5,,,,,,,,,,,,,,,,,,,,,,,,,4,4,3,5,,,,,,,,,4,4,2,3,2,5,5,5,5,4,2,2,4,3,2,3,3,5,4,4,3,5,2,3,3,4,4,4,1,2,5,5,,,,,4,4,4,3,4,5
2023-03-17 8:02:15,2023-03-17 8:08:02,0,71.174.81.214,25,1000,1,2023-03-17T8:8:3,1001,2,1740505,2,1,5,10,,,,,,,,,,,,,,,,,,,,5,4,4,4,,,,,,,,,2,3,2,,,,,,4,3,2,4,3,5,5,4,4,4,4,3,5,3,4,3,2,4,3,4,3,3,1,2,2,2,3,,,,,,5,4,4,5,4,4,5,3,3,4
2023-03-17 8:00:05,2023-03-17 8:07:39,0,71.174.81.214,24,1000,1,2023-03-17T8:7:39,1002,2,1740505,3,,,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,4,4,5,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2023-03-17 8:03:35,2023-03-17 8:15:38,0,71.174.81.214,0,1000,1,2023-03-17T8:15:38,1003,2,1740505,4,1,"1,4",10,4,5,5,,,,,,,,,,3,4,5,5,4,4,3,5,4,3,4,4,3,3,3,2,1,3,2,3,1,1,,,,,,,,,,,,,,,,,,,,,,5,4,3,5,3,4,3,3,3,5,4,5,4,4,5,5,5,5,5,5,,,,,,
2023-03-17 7:57:09,2023-03-17 8:12:26,0,71.174.81.214,,1000,1,2023-03-17T8:12:27,1004,2,1740505,5,1,"5,4",9,4,4,5,,,,,,,,,,,,,,,,,,,,,,,,,4,4,2,4,,,,,,,,,3,3,3,3,3,5,5,4,5,5,3,3,4,2,2,1,3,2,4,5,4,5,4,3,1,3,4,3,1,3,3,1,,,,,4,4,3,4,2,4
2023-03-17 8:01:50,2023-03-17 8:17:51,0,71.174.81.214,100,240,1,2023-03-17T8:17:52,1005,2,1740505,6,1,"5,4",9,4,3,4,,,,,,,,,,4,4,3,3,3,4,4,4,5,2,5,4,4,4,4,4,4,5,3,4,3,5,,,,,,4,4,4,5,2,,,,,,,,,,,,,,,,,,,,,,,4,1,4,4,4,5,4,4,5,4,5,5,5,5,4
2023-03-17 8:01:45,2023-03-17 8:07:59,0,71.174.81.214,100,239,1,2023-03-17T8:8:0,1006,2,1740505,7,1,5,10,,,,,,,,,,,,,5,3,3,5,2,3,3,,,,,4,4,4,4,,,,,,,,,,,,,4,5,3,4,3,5,5,4,5,5,4,4,4,2,1,1,4,5,4,4,3,4,2,2,3,2,3,,,,,,,,,,4,4,5,4,4,5
2023-03-17 9:07:09,2023-03-17 9:20:10,0,71.174.81.214,100,0,1,2023-03-17T9:20:11,1007,2,1740305,8,2,5,7,,,,,,,,,,,,,4,5,5,4,,4,3,,,,,5,4,4,5,,,,,,,,,,,,,5,5,4,5,5,5,5,5,5,4,5,5,5,3,2,3,3,3,5,4,4,4,3,3,4,,4,,,,,,,,,,4,5,5,5,5,5
2023-03-17 8:02:11,2023-03-17 8:29:53,0,71.174.81.214,100,,1,2023-03-17T8:29:53,1008,2,1740505,9,1,"5,4",10,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,2,3,3,,,,,,3,4,3,3,3,5,5,5,5,4,4,2,3,2,3,2,2,5,2,4,3,3,1,3,3,3,4,,,,,,4,4,4,4,4,4,4,4,2,4
2023-03-17 8:00:42,2023-03-17 8:12:00,0,71.174.81.214,100,1000,1,2023-03-17T8:12:0,1009,2,1740505,10,2,5,1,,,,,,,,,,,,,1,4,3,2,1,3,2,,,,,3,3,4,4,,,,,,,,,,,,,5,5,2,4,5,4,4,5,4,2,2,1,2,2,4,3,5,5,4,4,1,4,1,2,1,3,3,,,,,,,,,,3,4,3,1,1,1
2023-03-17 8:03:09,2023-03-17 8:13:27,0,71.174.81.214,100,1000,1,2023-03-17T8:13:28,1010,2,1740505,11,1,5,2,,,,,,,,,,,,,5,3,2,4,2,2,3,,,,,4,5,5,5,,,,,,,,,,,,,4,4,3,4,4,4,4,4,5,4,3,5,4,2,1,2,4,4,5,4,3,5,4,1,3,3,3,,,,,,,,,,4,3,4,4,4,4
2023-03-17 8:23:20,2023-03-17 8:34:00,0,71.174.81.214,100,1000,1,2023-03-17T8:34:0,1011,2,1740505,12,2,3,3,1,2,2,2,3,2,4,2,5,5,3,5,3,3,3,2,3,4,4,2,4,3,5,4,4,4,3,4,3,3,4,3,1,3,,,,,,,,,,,,,,,,,,,,,,1,2,4,4,3,4,3,2,2,4,4,,,,,,,,,,,,,,,
2023-03-17 8:36:36,2023-03-17 8:47:33,0,71.174.81.214,100,1000,1,2023-03-17T8:47:34,1012,2,1740505,13,1,3,4,4,5,4,,,,,,,,,,4,2,3,2,2,3,4,4,5,3,4,2,4,2,3,3,4,3,3,2,1,1,,,,,,,,,,,5,5,2,4,2,3,3,4,5,4,5,,,,,,,,,,,,4,2,3,3,2,4,4,3,3,,,,,,
2023-03-17 8:01:10,2023-03-17 8:09:17,0,71.174.81.214,100,1000,1,2023-03-17T8:9:18,1013,2,1740505,14,1,4,5,4,3,5,,,,,,,,,,3,4,3,4,3,4,4,2,4,4,2,3,1,2,2,4,2,3,5,2,2,1,,,,,,4,4,3,3,4,,,,,,,,,,,,,,,,,,,,,,,2,1,3,2,5,4,5,2,2,2,3,4,3,3,4
2023-03-17 10:06:07,2023-03-17 10:12:54,0,71.174.81.214,100,1000,1,2023-03-17T10:12:56,1014,2,1740505,15,1,5,6,,,,,,,,,,,,,3,3,4,2,1,2,4,,,,,2,2,2,2,,,,,,,,,,,,,1,2,1,2,3,4,5,3,5,3,1,2,3,2,2,1,2,4,3,2,3,2,4,2,3,2,2,,,,,,,,,,4,4,4,4,4,5
2023-03-17 7:57:13,2023-03-17 8:05:02,0,71.174.81.214,100,1000,1,2023-03-17T8:5:2,1015,2,1740505,16,4,5,7,,,,,,,,,,,,,,,,,,,,3,3,3,3,,,,,,,,,3,5,2,,,,,,2,1,3,2,4,4,4,3,3,2,2,3,4,3,5,5,5,4,3,4,2,4,5,3,1,3,2,,,,,,4,4,3,4,4,5,4,5,5,5
2023-03-17 7:57:50,2023-03-17 8:02:53,0,71.174.81.214,100,1000,1,2023-03-17T8:2:54,1016,2,1740505,17,2,5,8,1,1,1,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,1,,,,,,,,,5,4,3,3,3,2,4,3,3,5,1,1,1,1,3,5,1,1,4,5,4,1,3,1,3,2,1,1,1,1,1,1,,,,,1,1,1,1,1,2
2023-03-17 8:40:22,2023-03-17 8:53:19,0,71.174.81.214,100,1000,0,2023-03-18T8:53:20,1017,2,1740505,18,2,5,9,,,,,,,,,,,,,,,,,3,3,4,,,,,1,1,1,2,4,3,2,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,2,1,3,2,3,2,4,,,,,,
2023-03-17 8:37:13,2023-03-17 8:43:34,0,71.174.81.214,100,1000,1,2023-03-17T8:43:35,1018,2,1740505,19,2,2,10,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,2,2,1,,,,,,4,4,4,4,4,4,5,4,5,3,4,3,3,4,5,4,,,3,2,3,4,1,4,2,3,4,,,,,,4,5,5,4,4,4,5,4,3,4
2023-03-17 8:36:27,2023-03-17 8:44:07,0,71.174.81.214,100,1000,1,2023-03-17T8:44:8,1019,2,1740505,20,1,2,11,3,4,3,,,,,,,,,,2,3,3,2,4,5,4,3,4,3,5,3,4,4,5,4,5,3,4,5,4,4,,,,,,3,4,3,4,3,,,,,,,,,,,,,,,,,,,,,,,4,2,,2,4,4,5,2,4,5,5,4,3,3,4
2023-03-17 8:33:55,2023-03-17 8:43:13,0,71.174.81.214,100,1000,1,2023-03-17T8:43:14,1020,2,1740505,21,1,5,12,3,1,3,,,,,,,,,,,,,,2,3,3,,4,,4,,4,,4,4,3,4,3,1,5,2,,,,,,3,4,2,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,4,5,5,5,4,,4,4,4
2023-03-17 8:50:49,2023-03-17 9:13:18,0,71.174.81.214,100,1000,1,2023-03-17T9:13:19,1021,2,1740305,22,2,"5,2",1,,,,,,,,,,,,,2,3,3,3,2,3,3,,,,,3,4,4,4,,,,,,,,,,,,,2,2,1,3,2,5,5,4,5,4,2,2,2,1,1,1,1,3,3,4,3,4,4,1,1,1,3,,,,,,,,,,4,4,4,4,3,4
2023-03-17 7:57:37,2023-03-17 8:04:25,0,71.174.81.214,100,1000,1,2023-03-17T8:4:25,1022,2,1740305,23,1,5,2,,,,,,,,,,,,,,,,,,,,4,4,4,5,,,,,,,,,4,4,5,,,,,,3,3,4,4,4,4,5,4,5,4,3,3,3,2,2,2,3,3,3,2,3,4,4,3,3,2,4,,,,,,3,2,4,3,3,3,3,4,2,2
2023-03-17 8:01:47,2023-03-17 8:08:39,0,71.174.81.214,100,1000,1,2023-03-17T8:8:39,1023,2,1740305,24,1,"2,4",3,4,3,5,,,,,,,,,,2,2,2,2,1,2,2,4,3,2,3,2,3,3,4,3,4,2,3,4,3,2,,,,,,,,,,,5,4,3,5,3,1,2,2,2,2,4,,,,,,,,,,,,1,2,4,2,2,5,4,1,4,,,,,,
2023-03-17 8:37:21,2023-03-17 8:58:16,0,71.174.81.214,100,1000,1,2023-03-17T8:58:16,1024,2,1740305,25,1,5,4,3,3,3,,,,,,,,,,,,,,,,,,,,,,,,,4,4,2,3,,,,,,,,,3,4,4,3,2,5,5,4,5,3,3,2,3,4,3,3,2,4,4,3,3,3,2,2,3,3,4,3,3,3,3,3,,,,,4,3,2,3,3,3
2023-03-17 8:02:25,2023-03-17 8:11:16,0,71.174.81.214,100,1000,0,2023-03-18T8:11:21,1025,2,1740305,26,2,5,5,,,,,,,,,,,,,,,,,3,3,3,4,4,4,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,,1,3,,,,,,4,3,3,3,3,,,,,,,,,,
2023-03-17 9:33:54,2023-03-17 9:57:21,0,71.174.81.214,100,1000,1,2023-03-17T9:57:22,1026,2,1740305,27,2,5,6,,,,,,,,,5,5,4,5,,,,,,,,1,2,2,2,,,,,,,,,1,1,2,5,4,4,4,3,5,4,,4,2,5,4,4,3,5,3,2,2,1,1,5,3,2,2,3,1,1,1,2,1,3,3,,,,,,,,,,,,,,,
2023-03-17 9:48:38,2023-03-17 9:58:45,0,71.174.81.214,100,1000,1,2023-03-17T9:58:45,1027,2,1740305,28,1,5,7,,,,,,,,,2,4,4,2,,,,,,,,3,3,3,5,,,,,,,,,2,3,3,4,3,2,2,3,1,3,2,3,2,3,3,3,2,4,3,4,2,2,1,1,5,5,2,3,2,3,5,4,3,2,2,,,,,,,,,,,,,,,
2023-03-17 8:36:40,2023-03-17 8:43:21,0,71.174.81.214,100,1000,1,2023-03-17T8:43:22,1028,2,1740305,29,2,5,8,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,4,3,4,,,,,,3,3,2,4,3,5,4,4,5,4,4,2,3,4,2,5,4,4,3,3,3,3,2,2,3,1,4,,,,,,3,4,4,4,2,4,5,4,3,4
2023-03-17 9:40:56,2023-03-17 9:52:52,0,71.174.81.214,100,1000,1,2023-03-17T9:52:52,1029,2,1740305,30,2,5,9,3,4,3,5,3,5,5,5,5,4,4,5,4,4,4,4,2,3,2,4,4,3,4,3,4,3,5,5,5,4,3,5,4,4,,,,,,,,,,,1,5,4,4,5,4,3,4,3,1,4,,,,,,,,,,,,,,,,,,,,,,,,,,
2023-03-17 9:33:58,2023-03-17 9:48:33,0,71.174.81.214,100,1000,1,2023-03-17T9:48:33,1030,2,1740305,31,2,5,10,,,,,,,,,4,3,5,2,,,,,,,,5,4,4,5,,,,,,,,,5,2,4,4,4,2,2,3,2,2,1,1,2,1,2,2,5,5,5,4,4,3,1,1,1,1,3,2,3,3,3,4,4,4,2,,,,,,,,,,,,,,,
2023-03-17 8:03:04,2023-03-17 8:23:33,0,71.174.81.214,100,1000,1,2023-03-17T8:23:33,1031,2,1740305,32,1,5,11,1,1,1,3,4,2,4,3,,,,,,,,,,,,,,,,,,,,5,5,5,5,,,,4,4,3,2,3,2,3,3,1,3,3,5,3,4,5,3,3,5,1,2,2,1,1,4,5,3,5,4,2,4,2,3,,,,,,,,,,,,,,,
2023-03-17 8:33:14,2023-03-17 8:41:01,0,71.174.81.214,100,1000,1,2023-03-17T8:41:2,1032,2,1740305,33,1,5,12,,,,,,,,,,,,,2,1,1,1,2,2,3,,,,,2,1,3,2,,,,,,,,,,,,,1,1,1,1,1,4,3,4,5,4,1,1,2,3,2,3,4,2,2,3,3,2,2,2,1,2,3,,,,,,,,,,3,2,2,1,2,2
2023-03-17 7:57:06,2023-03-17 8:08:35,0,71.174.81.214,100,1000,1,2023-03-17T8:8:35,1033,2,1740505,34,2,5,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,2,2,2,2,2,2,2,2
2023-03-17 7:58:38,2023-03-17 8:12:04,0,71.174.81.214,100,1000,1,2023-03-17T8:12:5,1034,2,1740505,35,2,"5,4",12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1 StartDate EndDate Status IPAddress Progress Duration (in seconds) Finished RecordedDate ResponseId District School LASID Gender Race What grade are you in? s-emsa-q1 s-emsa-q2 s-emsa-q3 s-tint-q1 s-tint-q2 s-tint-q3 s-tint-q4 s-tint-q5 s-acpr-q1 s-acpr-q2 s-acpr-q3 s-acpr-q4 s-cure-q1 s-cure-q2 s-cure-q3 s-cure-q4 s-sten-q1 s-sten-q2 s-sten-q3 s-sper-q1 s-sper-q2 s-sper-q3 s-sper-q4 s-civp-q1 s-civp-q2 s-civp-q3 s-civp-q4 s-grmi-q1 s-grmi-q2 s-grmi-q3 s-grmi-q4 s-appa-q1 s-appa-q2 s-appa-q3 s-peff-q1 s-peff-q2 s-peff-q3 s-peff-q4 s-peff-q5 s-sbel-q1 s-sbel-q2 s-sbel-q3 s-sbel-q4 s-sbel-q5 s-phys-q1 s-phys-q2 s-phys-q3 s-phys-q4 s-vale-q1 s-vale-q2 s-vale-q3 s-vale-q4 s-acst-q1 s-acst-q2 s-acst-q3 s-sust-q1 s-sust-q2 s-grit-q1 s-grit-q2 s-grit-q3 s-grit-q4 s-expa-q1 s-poaf-q1 s-poaf-q2 s-poaf-q3 s-poaf-q4 s-tint-q1-1 s-tint-q2-1 s-tint-q3-1 s-tint-q4-1 s-tint-q5-1 s-acpr-q1-1 s-acpr-q2-1 s-acpr-q3-1 s-acpr-q4-1 s-peff-q1-1 s-peff-q2-1 s-peff-q3-1 s-peff-q4-1 s-peff-q5-1 s-peff-q6-1
2 2023-03-17 7:57:47 2023-03-17 8:09:15 0 71.174.81.214 100 1000 1 2023-03-17T8:9:15 1000 2 1740505 1 2 4 9 3 5 5 4 4 3 5 4 4 2 3 2 5 5 5 5 4 2 2 4 3 2 3 3 5 4 4 3 5 2 3 3 4 4 4 1 2 5 5 4 4 4 3 4 5
3 2023-03-17 8:02:15 2023-03-17 8:08:02 0 71.174.81.214 25 1000 1 2023-03-17T8:8:3 1001 2 1740505 2 1 5 10 5 4 4 4 2 3 2 4 3 2 4 3 5 5 4 4 4 4 3 5 3 4 3 2 4 3 4 3 3 1 2 2 2 3 5 4 4 5 4 4 5 3 3 4
4 2023-03-17 8:00:05 2023-03-17 8:07:39 0 71.174.81.214 24 1000 1 2023-03-17T8:7:39 1002 2 1740505 3 9 5 4 4 5 3
5 2023-03-17 8:03:35 2023-03-17 8:15:38 0 71.174.81.214 0 1000 1 2023-03-17T8:15:38 1003 2 1740505 4 1 1,4 10 4 5 5 3 4 5 5 4 4 3 5 4 3 4 4 3 3 3 2 1 3 2 3 1 1 5 4 3 5 3 4 3 3 3 5 4 5 4 4 5 5 5 5 5 5
6 2023-03-17 7:57:09 2023-03-17 8:12:26 0 71.174.81.214 1000 1 2023-03-17T8:12:27 1004 2 1740505 5 1 5,4 9 4 4 5 4 4 2 4 3 3 3 3 3 5 5 4 5 5 3 3 4 2 2 1 3 2 4 5 4 5 4 3 1 3 4 3 1 3 3 1 4 4 3 4 2 4
7 2023-03-17 8:01:50 2023-03-17 8:17:51 0 71.174.81.214 100 240 1 2023-03-17T8:17:52 1005 2 1740505 6 1 5,4 9 4 3 4 4 4 3 3 3 4 4 4 5 2 5 4 4 4 4 4 4 5 3 4 3 5 4 4 4 5 2 4 1 4 4 4 5 4 4 5 4 5 5 5 5 4
8 2023-03-17 8:01:45 2023-03-17 8:07:59 0 71.174.81.214 100 239 1 2023-03-17T8:8:0 1006 2 1740505 7 1 5 10 5 3 3 5 2 3 3 4 4 4 4 4 5 3 4 3 5 5 4 5 5 4 4 4 2 1 1 4 5 4 4 3 4 2 2 3 2 3 4 4 5 4 4 5
9 2023-03-17 9:07:09 2023-03-17 9:20:10 0 71.174.81.214 100 0 1 2023-03-17T9:20:11 1007 2 1740305 8 2 5 7 4 5 5 4 4 3 5 4 4 5 5 5 4 5 5 5 5 5 5 4 5 5 5 3 2 3 3 3 5 4 4 4 3 3 4 4 4 5 5 5 5 5
10 2023-03-17 8:02:11 2023-03-17 8:29:53 0 71.174.81.214 100 1 2023-03-17T8:29:53 1008 2 1740505 9 1 5,4 10 3 3 2 3 2 3 3 3 4 3 3 3 5 5 5 5 4 4 2 3 2 3 2 2 5 2 4 3 3 1 3 3 3 4 4 4 4 4 4 4 4 4 2 4
11 2023-03-17 8:00:42 2023-03-17 8:12:00 0 71.174.81.214 100 1000 1 2023-03-17T8:12:0 1009 2 1740505 10 2 5 1 1 4 3 2 1 3 2 3 3 4 4 5 5 2 4 5 4 4 5 4 2 2 1 2 2 4 3 5 5 4 4 1 4 1 2 1 3 3 3 4 3 1 1 1
12 2023-03-17 8:03:09 2023-03-17 8:13:27 0 71.174.81.214 100 1000 1 2023-03-17T8:13:28 1010 2 1740505 11 1 5 2 5 3 2 4 2 2 3 4 5 5 5 4 4 3 4 4 4 4 4 5 4 3 5 4 2 1 2 4 4 5 4 3 5 4 1 3 3 3 4 3 4 4 4 4
13 2023-03-17 8:23:20 2023-03-17 8:34:00 0 71.174.81.214 100 1000 1 2023-03-17T8:34:0 1011 2 1740505 12 2 3 3 1 2 2 2 3 2 4 2 5 5 3 5 3 3 3 2 3 4 4 2 4 3 5 4 4 4 3 4 3 3 4 3 1 3 1 2 4 4 3 4 3 2 2 4 4
14 2023-03-17 8:36:36 2023-03-17 8:47:33 0 71.174.81.214 100 1000 1 2023-03-17T8:47:34 1012 2 1740505 13 1 3 4 4 5 4 4 2 3 2 2 3 4 4 5 3 4 2 4 2 3 3 4 3 3 2 1 1 5 5 2 4 2 3 3 4 5 4 5 4 2 3 3 2 4 4 3 3
15 2023-03-17 8:01:10 2023-03-17 8:09:17 0 71.174.81.214 100 1000 1 2023-03-17T8:9:18 1013 2 1740505 14 1 4 5 4 3 5 3 4 3 4 3 4 4 2 4 4 2 3 1 2 2 4 2 3 5 2 2 1 4 4 3 3 4 2 1 3 2 5 4 5 2 2 2 3 4 3 3 4
16 2023-03-17 10:06:07 2023-03-17 10:12:54 0 71.174.81.214 100 1000 1 2023-03-17T10:12:56 1014 2 1740505 15 1 5 6 3 3 4 2 1 2 4 2 2 2 2 1 2 1 2 3 4 5 3 5 3 1 2 3 2 2 1 2 4 3 2 3 2 4 2 3 2 2 4 4 4 4 4 5
17 2023-03-17 7:57:13 2023-03-17 8:05:02 0 71.174.81.214 100 1000 1 2023-03-17T8:5:2 1015 2 1740505 16 4 5 7 3 3 3 3 3 5 2 2 1 3 2 4 4 4 3 3 2 2 3 4 3 5 5 5 4 3 4 2 4 5 3 1 3 2 4 4 3 4 4 5 4 5 5 5
18 2023-03-17 7:57:50 2023-03-17 8:02:53 0 71.174.81.214 100 1000 1 2023-03-17T8:2:54 1016 2 1740505 17 2 5 8 1 1 1 5 2 1 1 5 4 3 3 3 2 4 3 3 5 1 1 1 1 3 5 1 1 4 5 4 1 3 1 3 2 1 1 1 1 1 1 1 1 1 1 1 2
19 2023-03-17 8:40:22 2023-03-17 8:53:19 0 71.174.81.214 100 1000 0 2023-03-18T8:53:20 1017 2 1740505 18 2 5 9 3 3 4 1 1 1 2 4 3 2 4 2 2 2 1 3 2 3 2 4
20 2023-03-17 8:37:13 2023-03-17 8:43:34 0 71.174.81.214 100 1000 1 2023-03-17T8:43:35 1018 2 1740505 19 2 2 10 3 3 2 3 2 2 1 4 4 4 4 4 4 5 4 5 3 4 3 3 4 5 4 3 2 3 4 1 4 2 3 4 4 5 5 4 4 4 5 4 3 4
21 2023-03-17 8:36:27 2023-03-17 8:44:07 0 71.174.81.214 100 1000 1 2023-03-17T8:44:8 1019 2 1740505 20 1 2 11 3 4 3 2 3 3 2 4 5 4 3 4 3 5 3 4 4 5 4 5 3 4 5 4 4 3 4 3 4 3 4 2 2 4 4 5 2 4 5 5 4 3 3 4
22 2023-03-17 8:33:55 2023-03-17 8:43:13 0 71.174.81.214 100 1000 1 2023-03-17T8:43:14 1020 2 1740505 21 1 5 12 3 1 3 2 3 3 4 4 4 4 4 3 4 3 1 5 2 3 4 2 3 4 5 4 5 5 5 4 4 4 4
23 2023-03-17 8:50:49 2023-03-17 9:13:18 0 71.174.81.214 100 1000 1 2023-03-17T9:13:19 1021 2 1740305 22 2 5,2 1 2 3 3 3 2 3 3 3 4 4 4 2 2 1 3 2 5 5 4 5 4 2 2 2 1 1 1 1 3 3 4 3 4 4 1 1 1 3 4 4 4 4 3 4
24 2023-03-17 7:57:37 2023-03-17 8:04:25 0 71.174.81.214 100 1000 1 2023-03-17T8:4:25 1022 2 1740305 23 1 5 2 4 4 4 5 4 4 5 3 3 4 4 4 4 5 4 5 4 3 3 3 2 2 2 3 3 3 2 3 4 4 3 3 2 4 3 2 4 3 3 3 3 4 2 2
25 2023-03-17 8:01:47 2023-03-17 8:08:39 0 71.174.81.214 100 1000 1 2023-03-17T8:8:39 1023 2 1740305 24 1 2,4 3 4 3 5 2 2 2 2 1 2 2 4 3 2 3 2 3 3 4 3 4 2 3 4 3 2 5 4 3 5 3 1 2 2 2 2 4 1 2 4 2 2 5 4 1 4
26 2023-03-17 8:37:21 2023-03-17 8:58:16 0 71.174.81.214 100 1000 1 2023-03-17T8:58:16 1024 2 1740305 25 1 5 4 3 3 3 4 4 2 3 3 4 4 3 2 5 5 4 5 3 3 2 3 4 3 3 2 4 4 3 3 3 2 2 3 3 4 3 3 3 3 3 4 3 2 3 3 3
27 2023-03-17 8:02:25 2023-03-17 8:11:16 0 71.174.81.214 100 1000 0 2023-03-18T8:11:21 1025 2 1740305 26 2 5 5 3 3 3 4 4 4 4 3 1 3 4 3 3 3 3
28 2023-03-17 9:33:54 2023-03-17 9:57:21 0 71.174.81.214 100 1000 1 2023-03-17T9:57:22 1026 2 1740305 27 2 5 6 5 5 4 5 1 2 2 2 1 1 2 5 4 4 4 3 5 4 4 2 5 4 4 3 5 3 2 2 1 1 5 3 2 2 3 1 1 1 2 1 3 3
29 2023-03-17 9:48:38 2023-03-17 9:58:45 0 71.174.81.214 100 1000 1 2023-03-17T9:58:45 1027 2 1740305 28 1 5 7 2 4 4 2 3 3 3 5 2 3 3 4 3 2 2 3 1 3 2 3 2 3 3 3 2 4 3 4 2 2 1 1 5 5 2 3 2 3 5 4 3 2 2
30 2023-03-17 8:36:40 2023-03-17 8:43:21 0 71.174.81.214 100 1000 1 2023-03-17T8:43:22 1028 2 1740305 29 2 5 8 3 3 2 3 4 3 4 3 3 2 4 3 5 4 4 5 4 4 2 3 4 2 5 4 4 3 3 3 3 2 2 3 1 4 3 4 4 4 2 4 5 4 3 4
31 2023-03-17 9:40:56 2023-03-17 9:52:52 0 71.174.81.214 100 1000 1 2023-03-17T9:52:52 1029 2 1740305 30 2 5 9 3 4 3 5 3 5 5 5 5 4 4 5 4 4 4 4 2 3 2 4 4 3 4 3 4 3 5 5 5 4 3 5 4 4 1 5 4 4 5 4 3 4 3 1 4
32 2023-03-17 9:33:58 2023-03-17 9:48:33 0 71.174.81.214 100 1000 1 2023-03-17T9:48:33 1030 2 1740305 31 2 5 10 4 3 5 2 5 4 4 5 5 2 4 4 4 2 2 3 2 2 1 1 2 1 2 2 5 5 5 4 4 3 1 1 1 1 3 2 3 3 3 4 4 4 2
33 2023-03-17 8:03:04 2023-03-17 8:23:33 0 71.174.81.214 100 1000 1 2023-03-17T8:23:33 1031 2 1740305 32 1 5 11 1 1 1 3 4 2 4 3 5 5 5 5 4 4 3 2 3 2 3 3 1 3 3 5 3 4 5 3 3 5 1 2 2 1 1 4 5 3 5 4 2 4 2 3
34 2023-03-17 8:33:14 2023-03-17 8:41:01 0 71.174.81.214 100 1000 1 2023-03-17T8:41:2 1032 2 1740305 33 1 5 12 2 1 1 1 2 2 3 2 1 3 2 1 1 1 1 1 4 3 4 5 4 1 1 2 3 2 3 4 2 2 3 3 2 2 2 1 2 3 3 2 2 1 2 2
35 2023-03-17 7:57:06 2023-03-17 8:08:35 0 71.174.81.214 100 1000 1 2023-03-17T8:8:35 1033 2 1740505 34 2 5 9 2 2 2 2 2 2 2 2 2 2
36 2023-03-17 7:58:38 2023-03-17 8:12:04 0 71.174.81.214 100 1000 1 2023-03-17T8:12:5 1034 2 1740505 35 2 5,4 12

@ -1,29 +1,29 @@
require 'rails_helper'
require 'fileutils'
require "rails_helper"
require "fileutils"
RSpec.describe Cleaner do
let(:district) { create(:district, name: 'District1') }
let(:second_district) { create(:district, name: 'District2') }
let(:district) { create(:district, name: "District1") }
let(:second_district) { create(:district, name: "District2") }
let(:school) { create(:school, dese_id: 1_740_505, district:) }
let(:second_school) { create(:school, dese_id: 222_222, district: second_district) }
let(:academic_year) { create(:academic_year, range: '2022-23') }
let(:academic_year) { create(:academic_year, range: "2022-23") }
let(:respondents) { create(:respondent, school:, academic_year:, nine: 40, ten: 40, eleven: 40, twelve: 40) }
let(:recorded_date) { '2023-04-01' }
let(:recorded_date) { "2023-04-01" }
let(:input_filepath) do
Rails.root.join('spec', 'fixtures', 'raw')
Rails.root.join("spec", "fixtures", "raw")
end
let(:output_filepath) do
Rails.root.join('tmp', 'spec', 'clean')
Rails.root.join("tmp", "spec", "clean")
end
let(:log_filepath) do
Rails.root.join('tmp', 'spec', 'removed')
Rails.root.join("tmp", "spec", "removed")
end
let(:common_headers) do
['Recorded Date', 'Dese ID', 'ResponseID']
["Recorded Date", "Dese ID", "ResponseID"]
end
let(:standard_survey_items) do
@ -41,16 +41,16 @@ RSpec.describe Cleaner do
end
let(:short_form_survey_items) do
([create(:survey_item, survey_item_id: 's-phys-q1', on_short_form: true),
create(:survey_item, survey_item_id: 's-phys-q2', on_short_form: true),
create(:survey_item, survey_item_id: 's-phys-q3',
([create(:survey_item, survey_item_id: "s-phys-q1", on_short_form: true),
create(:survey_item, survey_item_id: "s-phys-q2", on_short_form: true),
create(:survey_item, survey_item_id: "s-phys-q3",
on_short_form: true)].map(&:survey_item_id) << common_headers).flatten
end
let(:early_education_survey_items) do
([create(:survey_item, survey_item_id: 's-emsa-es1'),
create(:survey_item, survey_item_id: 's-emsa-es2'),
create(:survey_item, survey_item_id: 's-emsa-es3')].map(&:survey_item_id) << common_headers).flatten
([create(:survey_item, survey_item_id: "s-emsa-es1"),
create(:survey_item, survey_item_id: "s-emsa-es2"),
create(:survey_item, survey_item_id: "s-emsa-es3")].map(&:survey_item_id) << common_headers).flatten
end
let(:teacher_survey_items) do
@ -79,84 +79,232 @@ RSpec.describe Cleaner do
respondents
end
context 'Creating a new Cleaner' do
it 'creates a directory for the clean data' do
context "Creating a new Cleaner" do
it "creates a directory for the clean data" do
Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean
expect(output_filepath).to exist
end
it 'creates a directory for the removed data' do
it "creates a directory for the removed data" do
Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean
expect(log_filepath).to exist
end
end
context '.filename' do
context 'defines a filename in the format: [district].[early_ed/short_form/standard/teacher].[year as 2022-23]' do
context 'when the file is based on standard survey items' do
it 'adds the survey type as standard to the filename' do
context ".process_raw_file" do
it "sorts data into valid and invalid csvs" do
cleaner = Cleaner.new(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:)
processed_data = cleaner.process_raw_file(
file: path_to_sample_raw_file, disaggregation_data: cleaner.disaggregation_data
)
processed_data in [headers, clean_csv, log_csv, data]
reads_headers_from_raw_csv(processed_data)
valid_rows = %w[1000 1001 1004 1005 1008 1017 1018 1019 1020 1024 1025 1026
1027 1028]
valid_rows.each do |response_id|
valid_row = data.find { |row| row.response_id == response_id }
expect(valid_row.valid?).to eq true
end
invalid_rows = %w[1002 1003 1006 1007 1009 1010 1011 1012 1013 1014 1015 1016 1021 1022 1023 1029 1030 1031 1032
1033 1034]
invalid_rows.each do |response_id|
invalid_row = data.find { |row| row.response_id == response_id }
expect(invalid_row.valid?).to eq false
end
expect(clean_csv.length).to eq valid_rows.length + 1 # headers + rows
expect(log_csv.length).to eq invalid_rows.length + 1 # headers + rows
csv_contains_the_correct_rows(clean_csv, valid_rows)
csv_contains_the_correct_rows(log_csv, invalid_rows)
invalid_rows_are_rejected_for_the_correct_reasons(data)
end
it "adds dissaggregation data to the cleaned file " do
cleaner = Cleaner.new(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:)
processed_data = cleaner.process_raw_file(
file: path_to_sample_raw_file, disaggregation_data: cleaner.disaggregation_data
)
processed_data in [headers, clean_csv, log_csv, data]
index_of_income = clean_csv.first.index("Income")
expect(clean_csv.second[index_of_income]).to eq "Economically Disadvantaged - Y"
one_thousand = data.find { |row| row.response_id == "1000" }
expect(one_thousand.income).to eq "Economically Disadvantaged - Y"
one_thousand_one = data.find { |row| row.response_id == "1001" }
expect(one_thousand_one.income).to eq "Economically Disadvantaged - N"
end
end
context ".filename" do
context "defines a filename in the format: [district].[early_ed/short_form/standard/teacher].[year as 2022-23]" do
context "when the file is based on standard survey items" do
it "adds the survey type as standard to the filename" do
survey_items = SurveyItem.where(survey_item_id: standard_survey_items)
data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: standard_survey_items, genders: nil, survey_items:,
data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: standard_survey_items, genders: nil, survey_items:,
schools: School.school_hash)]
filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename(
headers: standard_survey_items, data:
)
expect(filename).to eq 'District1.standard.2022-23.csv'
expect(filename).to eq "District1.standard.2022-23.csv"
end
context 'when the file is based on short form survey items' do
it 'adds the survey type as short form to the filename' do
context "when the file is based on short form survey items" do
it "adds the survey type as short form to the filename" do
survey_items = SurveyItem.where(survey_item_id: short_form_survey_items)
data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: short_form_survey_items, genders: nil, survey_items:,
data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: short_form_survey_items, genders: nil, survey_items:,
schools: School.school_hash)]
filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename(
headers: short_form_survey_items, data:
)
expect(filename).to eq 'District1.short_form.2022-23.csv'
expect(filename).to eq "District1.short_form.2022-23.csv"
end
end
context 'when the file is based on early education survey items' do
it 'adds the survey type as early education to the filename' do
context "when the file is based on early education survey items" do
it "adds the survey type as early education to the filename" do
survey_items = SurveyItem.where(survey_item_id: early_education_survey_items)
data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: early_education_survey_items, genders: nil, survey_items:,
data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: early_education_survey_items, genders: nil, survey_items:,
schools: School.school_hash)]
filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename(
headers: early_education_survey_items, data:
)
expect(filename).to eq 'District1.early_education.2022-23.csv'
expect(filename).to eq "District1.early_education.2022-23.csv"
end
end
context 'when the file is based on teacher survey items' do
it 'adds the survey type as teacher to the filename' do
context "when the file is based on teacher survey items" do
it "adds the survey type as teacher to the filename" do
survey_items = SurveyItem.where(survey_item_id: teacher_survey_items)
data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: teacher_survey_items, genders: nil, survey_items:,
data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, genders: nil, survey_items:,
schools: School.school_hash)]
filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename(
headers: teacher_survey_items, data:
)
expect(filename).to eq 'District1.teacher.2022-23.csv'
expect(filename).to eq "District1.teacher.2022-23.csv"
end
end
context 'when there is more than one district' do
it 'adds all districts to the filename' do
context "when there is more than one district" do
it "adds all districts to the filename" do
survey_items = SurveyItem.where(survey_item_id: teacher_survey_items)
data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash),
SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '222_222' },
data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash),
SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "222_222" },
headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash)]
filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename(
headers: teacher_survey_items, data:
)
expect(filename).to eq 'District1.District2.teacher.2022-23.csv'
expect(filename).to eq "District1.District2.teacher.2022-23.csv"
end
end
end
end
end
end
# Verifies that the headers parsed out of a raw survey CSV match the complete
# expected column set — demographic columns, every survey item (including the
# "-1" variant items, e.g. "s-tint-q1-1"), and the appended income columns.
#
# `processed_data` is expected to be a 4-element array of
# [headers, clean_csv, log_csv, data]; only the headers are examined here.
def reads_headers_from_raw_csv(processed_data)
  # One-line pattern match binds the pieces; only `headers` is used.
  processed_data in [headers, _clean_csv, _log_csv, _data]

  expected_headers = [
    "StartDate", "EndDate", "Status", "IPAddress", "Progress",
    "Duration (in seconds)", "Finished", "RecordedDate", "ResponseId",
    "District", "School", "LASID", "Gender", "Race", "What grade are you in?",
    "s-emsa-q1", "s-emsa-q2", "s-emsa-q3",
    "s-tint-q1", "s-tint-q2", "s-tint-q3", "s-tint-q4", "s-tint-q5",
    "s-acpr-q1", "s-acpr-q2", "s-acpr-q3", "s-acpr-q4",
    "s-cure-q1", "s-cure-q2", "s-cure-q3", "s-cure-q4",
    "s-sten-q1", "s-sten-q2", "s-sten-q3",
    "s-sper-q1", "s-sper-q2", "s-sper-q3", "s-sper-q4",
    "s-civp-q1", "s-civp-q2", "s-civp-q3", "s-civp-q4",
    "s-grmi-q1", "s-grmi-q2", "s-grmi-q3", "s-grmi-q4",
    "s-appa-q1", "s-appa-q2", "s-appa-q3",
    "s-peff-q1", "s-peff-q2", "s-peff-q3", "s-peff-q4", "s-peff-q5", "s-peff-q6",
    "s-sbel-q1", "s-sbel-q2", "s-sbel-q3", "s-sbel-q4", "s-sbel-q5",
    "s-phys-q1", "s-phys-q2", "s-phys-q3", "s-phys-q4",
    "s-vale-q1", "s-vale-q2", "s-vale-q3", "s-vale-q4",
    "s-acst-q1", "s-acst-q2", "s-acst-q3",
    "s-sust-q1", "s-sust-q2",
    "s-grit-q1", "s-grit-q2", "s-grit-q3", "s-grit-q4",
    "s-expa-q1",
    "s-poaf-q1", "s-poaf-q2", "s-poaf-q3", "s-poaf-q4",
    "s-tint-q1-1", "s-tint-q2-1", "s-tint-q3-1", "s-tint-q4-1", "s-tint-q5-1",
    "s-acpr-q1-1", "s-acpr-q2-1", "s-acpr-q3-1", "s-acpr-q4-1",
    "s-peff-q1-1", "s-peff-q2-1", "s-peff-q3-1", "s-peff-q4-1", "s-peff-q5-1",
    "s-peff-q6-1",
    "Raw Income", "Income"
  ]

  # Order-insensitive, duplicate-insensitive comparison on both sides.
  expect(headers.to_set.sort).to eq expected_headers.to_set.sort
end
# Asserts that each known-invalid row fails validation for exactly the
# expected reason and passes every other validity check.
#
# `data` is a collection of row objects responding to #response_id,
# #valid_progress?, #valid_duration?, #valid_grade?, and #valid_sd?.
#
# NOTE(review): the original asserted response id "1007" twice back-to-back
# (copy-paste duplication); the duplicate is removed here — confirm that
# "1008" was not the id actually intended by the second block.
def invalid_rows_are_rejected_for_the_correct_reasons(data)
  # response_id => the single validation expected to fail for that row.
  expected_failures = {
    "1002" => :valid_progress?,
    "1003" => :valid_progress?,
    "1006" => :valid_duration?,
    "1007" => :valid_duration?,
    "1009" => :valid_grade?,
    "1010" => :valid_grade?,
    "1011" => :valid_grade?,
    "1022" => :valid_grade?,
    "1023" => :valid_grade?,
    "1033" => :valid_sd?,
    "1034" => :valid_sd?
  }
  validations = %i[valid_progress? valid_duration? valid_grade? valid_sd?]

  expected_failures.each do |response_id, failing_check|
    row = data.find { |r| r.response_id == response_id }
    validations.each do |check|
      # The designated check must be false; every other check must be true.
      expect(row.public_send(check)).to eq(check != failing_check)
    end
  end
end
# Asserts that the cleaned CSV contains exactly the expected rows, in order,
# identified by their ResponseId values.
#
# `csv`  — parsed CSV as an array of row arrays; csv[0] is the header row,
#          so data rows are offset by one.
# `rows` — expected ResponseId strings, in the order they should appear.
def csv_contains_the_correct_rows(csv, rows)
  # "ResponseId" is the ninth column, i.e. zero-based index 8.
  # (Original comment said "eigth column" — typo and off by one.)
  response_id_index = 8
  rows.each_with_index do |expected_response_id, index|
    expect(csv[index + 1][response_id_index]).to eq expected_response_id
  end
end

Loading…
Cancel
Save