fix: ensure cleaner outputs columns for all survey items. Before the fix, if a survey item variant (ending in -1, e.g. s-tint-q1-1) did not have a matching survey item (s-tint-q1), the resulting CSV would not include that column.

This commit is contained in:
rebuilt 2023-08-23 15:30:43 -07:00
parent 2c9df34fac
commit 7bd7923d41
8 changed files with 300 additions and 98 deletions

View file

@ -1,4 +1,4 @@
require 'fileutils'
require "fileutils"
class Cleaner
attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv
@ -15,47 +15,24 @@ class Cleaner
end
def clean
Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
puts filepath
File.open(filepath) do |file|
File.open(filepath) do |_file|
clean_csv = []
log_csv = []
data = []
headers = CSV.parse(file.first).first
filtered_headers = remove_unwanted_headers(headers:)
log_headers = (filtered_headers + ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
'Valid Standard Deviation?']).flatten
clean_csv << filtered_headers
log_csv << log_headers
all_survey_items = survey_items(headers:)
file.lazy.each_slice(1000) do |lines|
CSV.parse(lines.join, headers:).map do |row|
values = SurveyItemValues.new(row:, headers:, genders:,
survey_items: all_survey_items, schools:)
next unless values.valid_school?
data << values
values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
end
end
unless data.empty?
filename = filename(headers:, data:)
write_csv(data: clean_csv, output_filepath:, filename:)
write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename:)
end
filename = filename(headers:, data:)
write_csv(data: clean_csv, output_filepath:, filename:)
write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
end
end
end
def filename(headers:, data:)
survey_item_ids = headers.filter(&:present?).filter do |header|
header.start_with?('s-', 't-')
end.reject { |item| item.end_with? '-1' }
header.start_with?("s-", "t-")
end.reject { |item| item.end_with? "-1" }
survey_type = SurveyItem.survey_type(survey_item_ids:)
range = data.first.academic_year.range
@ -63,16 +40,62 @@ class Cleaner
row.district.name
end.to_set.to_a
districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
end
# Reads one raw survey CSV and sorts its rows into "clean" and "removed" tables.
#
# The first line of +file+ is taken as the header row; "Raw Income" and "Income"
# are appended to it. include_all_headers then appends (in place) the base id of
# any "-1" variant column that has no base column, so the +headers+ array used
# for parsing below already carries those extra names.
#
# Rows are consumed in slices of 1000 lines. A row whose SurveyItemValues fails
# valid_school? is dropped entirely. Otherwise it is recorded in +data+ and
# written either to the clean table (valid?) or to the log table with four
# validity flags appended.
#
# @param file [#first, #lazy] an open, line-enumerable handle on the raw CSV
# @param disaggregation_data passed through untouched to SurveyItemValues.new
# @return [Array] +[headers, clean_csv, log_csv, data]+
def process_raw_file(file:, disaggregation_data:)
  headers = CSV.parse(file.first).first
  headers << "Raw Income"
  headers << "Income"

  filtered_headers = remove_unwanted_headers(headers: include_all_headers(headers:))
  validity_columns = ["Valid Duration?", "Valid Progress?", "Valid Grade?",
                      "Valid Standard Deviation?"]

  # Both tables start with their header row.
  clean_csv = [filtered_headers]
  log_csv = [(filtered_headers + validity_columns).flatten]
  data = []

  all_survey_items = survey_items(headers:)

  file.lazy.each_slice(1000) do |lines|
    CSV.parse(lines.join, headers:).each do |row|
      values = SurveyItemValues.new(row:, headers:, genders:,
                                    survey_items: all_survey_items, schools:, disaggregation_data:)
      next unless values.valid_school?

      data << values

      if values.valid?
        clean_csv << values.to_a
      else
        # Rejected rows keep their values and gain one flag per validity check,
        # in the same order as validity_columns.
        log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
      end
    end
  end

  [headers, clean_csv, log_csv, data]
end
private
# Guarantees that every variant column ("<id>-1") has its base column ("<id>")
# somewhere in the header list.
#
# Missing base ids are appended to +headers+ itself (the caller's array is
# mutated) in the order their variants appear; blank and nil headers are ignored.
#
# @param headers [Array<String, nil>] header names from the raw CSV
# @return [Array<String, nil>] the same +headers+ array, possibly extended
def include_all_headers(headers:)
  # Snapshot the variants first so appending to headers cannot affect iteration.
  variant_ids = headers.select { |header| header.present? && header.end_with?("-1") }

  variant_ids.each do |variant_id|
    base_id = variant_id.delete_suffix("-1")
    headers << base_id unless headers.include?(base_id)
  end

  headers
end
# Runs the two directory set-up helpers: the output directory first, then the
# log directory.
def initialize_directories
# NOTE: "ouput" is the spelling the helper is defined with in this class; a
# rename would have to change the definition and this call together.
create_ouput_directory
create_log_directory
end
def remove_unwanted_headers(headers:)
headers.to_set.to_a.compact.reject do |item|
item.start_with? 'Q'
end.reject { |item| item.end_with? '-1' }
item.start_with? "Q"
end.reject { |item| item.end_with? "-1" }
end
def write_csv(data:, output_filepath:, filename:, prefix: '')
def write_csv(data:, output_filepath:, filename:, prefix: "")
csv = CSV.generate do |csv|
data.each do |row|
csv << row
@ -102,13 +125,10 @@ class Cleaner
end
def survey_items(headers:)
@survey_items ||= SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
end
def get_survey_item_ids_from_headers(headers:)
headers
.filter(&:present?)
.filter { |header| header.start_with? 't-', 's-' }
survey_item_ids = headers
.filter(&:present?)
.filter { |header| header.start_with? "t-", "s-" }
@survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
end
def create_ouput_directory

View file

@ -7,6 +7,11 @@ class SurveyItemValues
@genders = genders
@survey_items = survey_items
@schools = schools
@disaggregation_data = disaggregation_data
copy_likert_scores_from_variant_survey_items
row["Income"] = income
row["Raw Income"] = raw_income
end
# Some survey items have variants, i.e. a survey item with an id of s-tint-q1 might have a variant that looks like s-tint-q1-1. We must ensure that all variants in the form of s-tint-q1-1 have a matching pair.
@ -115,6 +120,8 @@ class SurveyItemValues
return "Unknown" unless disaggregation_data.present?
byebug
disaggregation = disaggregation_data[[lasid, district.name, academic_year.range]]
return "Unknown" unless disaggregation.present?
@ -147,7 +154,6 @@ class SurveyItemValues
end
def to_a
copy_likert_scores_from_variant_survey_items
headers.select(&:present?)
.reject { |key, _value| key.start_with? "Q" }
.reject { |key, _value| key.end_with? "-1" }
@ -238,7 +244,8 @@ class SurveyItemValues
headers.filter(&:present?).filter { |header| header.end_with? "-1" }.each do |header|
likert_score = row[header]
main_item = header.gsub("-1", "")
row[main_item] = likert_score if likert_score.present?
row[main_item] = likert_score if likert_score.present? && row[main_item].blank?
end
end
end

View file

@ -1,14 +1,6 @@
<g class="grouped-bar-column" data-for-measure-id="<%= column.measure.measure_id %>">
<% score_label_y = [5, 10, 15, 5, 10, 15 ] %>
<% column.bars.each_with_index do |bar, index| %>
<<<<<<< HEAD
<rect data-for-academic-year="<%= bar.academic_year.range %>" x="<%= bar.x_position %>%" y="<%= bar.y_offset %>%" width="<%= column.bar_width %>%" height="<%= bar.bar_height_percentage %>%" fill="<%= bar.color %>" />
<% if ENV["SCORES"].present? && ENV["SCORES"].upcase == "SHOW" %>
<text x="<%= bar.x_position + (column.bar_width * 0.5) %>%" y="<%= score_label_y[index] %>%" text-anchor="middle" dominant-baseline="middle">
<%= bar.average %>
</text>
=======
<% if column.sufficient_data?(index) %>
<rect
<% if column.show_popover? %>
@ -29,7 +21,6 @@
</text>
<% end %>
>>>>>>> 67e469a6 (feat: add popover to analyze graphs that displays the n-size of the different columns. Make sure to only calculate a score for a race if there are more than 10 respondents to a question.)
<% end %>
<% end %>