fix: ensure the cleaner outputs columns for all survey items. Before the fix, if a survey item variant (ending in -1, e.g. s-tint-q1-1) did not have a matching survey item (s-tint-q1), the resulting CSV would not include that column.

2026-03-13 09:20:38 -07:00 · 2023-08-23 15:30:43 -07:00 · 2023-08-23 15:30:43 -07:00 · 7bd7923d41
commit 7bd7923d41
parent 2c9df34fac
8 changed files with 300 additions and 98 deletions
--- a/app/services/cleaner.rb
+++ b/app/services/cleaner.rb
@ -1,4 +1,4 @@
-require 'fileutils'
+require "fileutils"
 class Cleaner
  attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv

@ -15,47 +15,24 @@ class Cleaner
  end

  def clean
-    Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
+    Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
      puts filepath
-      File.open(filepath) do |file|
+      File.open(filepath) do |_file|
        clean_csv = []
        log_csv = []
        data = []

-        headers = CSV.parse(file.first).first
-        filtered_headers = remove_unwanted_headers(headers:)
-        log_headers = (filtered_headers + ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
-                                           'Valid Standard Deviation?']).flatten
-
-        clean_csv << filtered_headers
-        log_csv << log_headers
-
-        all_survey_items = survey_items(headers:)
-
-        file.lazy.each_slice(1000) do |lines|
-          CSV.parse(lines.join, headers:).map do |row|
-            values = SurveyItemValues.new(row:, headers:, genders:,
-                                          survey_items: all_survey_items, schools:)
-            next unless values.valid_school?
-
-            data << values
-            values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
-          end
-        end
-
-        unless data.empty?
-          filename = filename(headers:, data:)
-          write_csv(data: clean_csv, output_filepath:, filename:)
-          write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename:)
-        end
+        filename = filename(headers:, data:)
+        write_csv(data: clean_csv, output_filepath:, filename:)
+        write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
      end
    end
  end

  def filename(headers:, data:)
    survey_item_ids = headers.filter(&:present?).filter do |header|
-                        header.start_with?('s-', 't-')
-                      end.reject { |item| item.end_with? '-1' }
+                        header.start_with?("s-", "t-")
+                      end.reject { |item| item.end_with? "-1" }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    range = data.first.academic_year.range

@ -63,16 +40,62 @@ class Cleaner
      row.district.name
    end.to_set.to_a

-    districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
+    districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
+  end
+
+  def process_raw_file(file:, disaggregation_data:)
+    clean_csv = []
+    log_csv = []
+    data = []
+
+    headers = (CSV.parse(file.first).first << "Raw Income") << "Income"
+    filtered_headers = include_all_headers(headers:)
+    filtered_headers = remove_unwanted_headers(headers: filtered_headers)
+    log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
+                                       "Valid Standard Deviation?"]).flatten
+
+    clean_csv << filtered_headers
+    log_csv << log_headers
+
+    all_survey_items = survey_items(headers:)
+
+    file.lazy.each_slice(1000) do |lines|
+      CSV.parse(lines.join, headers:).map do |row|
+        values = SurveyItemValues.new(row:, headers:, genders:,
+                                      survey_items: all_survey_items, schools:, disaggregation_data:)
+        next unless values.valid_school?
+
+        data << values
+        values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
+      end
+    end
+    [headers, clean_csv, log_csv, data]
+  end
+
+  private
+
+  def include_all_headers(headers:)
+    alternates = headers.filter(&:present?)
+                        .filter { |header| header.end_with? "-1" }
+    alternates.each do |header|
+      main = header.sub(/-1\z/, "")
+      headers.push(main) unless headers.include?(main)
+    end
+    headers
+  end
+
+  def initialize_directories
+    create_ouput_directory
+    create_log_directory
  end

  def remove_unwanted_headers(headers:)
    headers.to_set.to_a.compact.reject do |item|
-      item.start_with? 'Q'
-    end.reject { |item| item.end_with? '-1' }
+      item.start_with? "Q"
+    end.reject { |item| item.end_with? "-1" }
  end

-  def write_csv(data:, output_filepath:, filename:, prefix: '')
+  def write_csv(data:, output_filepath:, filename:, prefix: "")
    csv = CSV.generate do |csv|
      data.each do |row|
        csv << row
@ -102,13 +125,10 @@ class Cleaner
  end

  def survey_items(headers:)
-    @survey_items ||= SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
-  end
-
-  def get_survey_item_ids_from_headers(headers:)
-    headers
-      .filter(&:present?)
-      .filter { |header| header.start_with? 't-', 's-' }
+    survey_item_ids = headers
+                      .filter(&:present?)
+                      .filter { |header| header.start_with? "t-", "s-" }
+    @survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
  end

  def create_ouput_directory
--- a/app/services/survey_item_values.rb
+++ b/app/services/survey_item_values.rb
@ -7,6 +7,11 @@ class SurveyItemValues
    @genders = genders
    @survey_items = survey_items
    @schools = schools
+    @disaggregation_data = disaggregation_data
+
+    copy_likert_scores_from_variant_survey_items
+    row["Income"] = income
+    row["Raw Income"] = raw_income
  end

  # Some survey items have variants, i.e.  a survey item with an id of s-tint-q1 might have a variant that looks like s-tint-q1-1.  We must ensure that all variants in the form of s-tint-q1-1 have a matching pair.
@ -115,6 +120,8 @@ class SurveyItemValues

    return "Unknown" unless disaggregation_data.present?

+    byebug
+
    disaggregation = disaggregation_data[[lasid, district.name, academic_year.range]]
    return "Unknown" unless disaggregation.present?

@ -147,7 +154,6 @@ class SurveyItemValues
  end

  def to_a
-    copy_likert_scores_from_variant_survey_items
    headers.select(&:present?)
           .reject { |key, _value| key.start_with? "Q" }
           .reject { |key, _value| key.end_with? "-1" }
@ -238,7 +244,8 @@ class SurveyItemValues
    headers.filter(&:present?).filter { |header| header.end_with? "-1" }.each do |header|
      likert_score = row[header]
      main_item = header.gsub("-1", "")
-      row[main_item] = likert_score if likert_score.present?
+      row[main_item] = likert_score if likert_score.present? && row[main_item].blank?
    end
  end
 end
+
--- a/app/views/analyze/_grouped_bar_column.html.erb
+++ b/app/views/analyze/_grouped_bar_column.html.erb
@ -1,14 +1,6 @@
 <g class="grouped-bar-column" data-for-measure-id="<%= column.measure.measure_id %>">
 <% score_label_y = [5, 10, 15, 5, 10, 15 ] %>
  <% column.bars.each_with_index do |bar, index| %>
-<<<<<<< HEAD
-    <rect data-for-academic-year="<%= bar.academic_year.range %>" x="<%= bar.x_position %>%" y="<%= bar.y_offset %>%" width="<%= column.bar_width %>%" height="<%= bar.bar_height_percentage %>%" fill="<%= bar.color %>" />
-
-    <% if ENV["SCORES"].present?  && ENV["SCORES"].upcase == "SHOW" %>
-      <text x="<%= bar.x_position + (column.bar_width * 0.5) %>%" y="<%= score_label_y[index] %>%" text-anchor="middle" dominant-baseline="middle">
-        <%= bar.average %>
-      </text>
-=======
    <% if column.sufficient_data?(index)   %>
      <rect
      <% if column.show_popover? %>
@ -29,7 +21,6 @@
        </text>
      <% end %>

->>>>>>> 67e469a6 (feat: add popover to analyze graphs that displays the n-size of the different columns.  Make sure to only calculate a score for a race if there are more than 10 respondents to a question.)
    <% end %>
  <% end %>