diff --git a/.ruby-version b/.ruby-version
index e4604e3a..be94e6f5 100644
--- a/.ruby-version
+++ b/.ruby-version
@@ -1 +1 @@
-3.2.1
+3.2.2
diff --git a/Gemfile b/Gemfile
index 1be44d75..9e802c78 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,5 +1,5 @@
 source "https://rubygems.org"
-ruby "3.2.1"
+ruby "3.2.2"
 
 git_source(:github) do |repo_name|
   repo_name = "#{repo_name}/#{repo_name}" unless repo_name.include?("/")
diff --git a/Gemfile.lock b/Gemfile.lock
index ca5d126a..dc8c3350 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -506,7 +506,7 @@ DEPENDENCIES
   watir
 
 RUBY VERSION
-   ruby 3.2.1p31
+   ruby 3.2.2p53
 
 BUNDLED WITH
    2.3.3
diff --git a/app/services/cleaner.rb b/app/services/cleaner.rb
index 8f407198..e457092d 100644
--- a/app/services/cleaner.rb
+++ b/app/services/cleaner.rb
@@ -1,4 +1,4 @@
-require 'fileutils'
+require "fileutils"
 
 class Cleaner
   attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv
@@ -15,47 +15,24 @@ class Cleaner
   end
 
   def clean
-    Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
+    Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
       puts filepath
-      File.open(filepath) do |file|
+      File.open(filepath) do |_file|
         clean_csv = []
         log_csv = []
         data = []
 
-        headers = CSV.parse(file.first).first
-        filtered_headers = remove_unwanted_headers(headers:)
-        log_headers = (filtered_headers + ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
-                                           'Valid Standard Deviation?']).flatten
-
-        clean_csv << filtered_headers
-        log_csv << log_headers
-
-        all_survey_items = survey_items(headers:)
-
-        file.lazy.each_slice(1000) do |lines|
-          CSV.parse(lines.join, headers:).map do |row|
-            values = SurveyItemValues.new(row:, headers:, genders:,
-                                          survey_items: all_survey_items, schools:)
-            next unless values.valid_school?
-
-            data << values
-            values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
-          end
-        end
-
-        unless data.empty?
-          filename = filename(headers:, data:)
-          write_csv(data: clean_csv, output_filepath:, filename:)
-          write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename:)
-        end
+        filename = filename(headers:, data:)
+        write_csv(data: clean_csv, output_filepath:, filename:)
+        write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
       end
     end
   end
 
   def filename(headers:, data:)
     survey_item_ids = headers.filter(&:present?).filter do |header|
-      header.start_with?('s-', 't-')
-    end.reject { |item| item.end_with? '-1' }
+      header.start_with?("s-", "t-")
+    end.reject { |item| item.end_with? "-1" }
 
     survey_type = SurveyItem.survey_type(survey_item_ids:)
     range = data.first.academic_year.range
@@ -63,16 +40,62 @@ class Cleaner
       row.district.name
     end.to_set.to_a
 
-    districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
+    districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
+  end
+
+  def process_raw_file(file:, disaggregation_data:)
+    clean_csv = []
+    log_csv = []
+    data = []
+
+    headers = (CSV.parse(file.first).first << "Raw Income") << "Income"
+    filtered_headers = include_all_headers(headers:)
+    filtered_headers = remove_unwanted_headers(headers: filtered_headers)
+    log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
+                                       "Valid Standard Deviation?"]).flatten
+
+    clean_csv << filtered_headers
+    log_csv << log_headers
+
+    all_survey_items = survey_items(headers:)
+
+    file.lazy.each_slice(1000) do |lines|
+      CSV.parse(lines.join, headers:).map do |row|
+        values = SurveyItemValues.new(row:, headers:, genders:,
+                                      survey_items: all_survey_items, schools:, disaggregation_data:)
+        next unless values.valid_school?
+
+        data << values
+        values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
+      end
+    end
+    [headers, clean_csv, log_csv, data]
+  end
+
+  private
+
+  def include_all_headers(headers:)
+    alternates = headers.filter(&:present?)
+                        .filter { |header| header.end_with? "-1" }
+    alternates.each do |header|
+      main = header.sub(/-1\z/, "")
+      headers.push(main) unless headers.include?(main)
+    end
+    headers
+  end
+
+  def initialize_directories
+    create_ouput_directory
+    create_log_directory
   end
 
   def remove_unwanted_headers(headers:)
     headers.to_set.to_a.compact.reject do |item|
-      item.start_with? 'Q'
-    end.reject { |item| item.end_with? '-1' }
+      item.start_with? "Q"
+    end.reject { |item| item.end_with? "-1" }
   end
 
-  def write_csv(data:, output_filepath:, filename:, prefix: '')
+  def write_csv(data:, output_filepath:, filename:, prefix: "")
     csv = CSV.generate do |csv|
       data.each do |row|
         csv << row
@@ -102,13 +125,10 @@ class Cleaner
   end
 
   def survey_items(headers:)
-    @survey_items ||= SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
-  end
-
-  def get_survey_item_ids_from_headers(headers:)
-    headers
-      .filter(&:present?)
-      .filter { |header| header.start_with? 't-', 's-' }
+    survey_item_ids = headers
+                      .filter(&:present?)
+                      .filter { |header| header.start_with? "t-", "s-" }
+    @survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
   end
 
   def create_ouput_directory
diff --git a/app/services/survey_item_values.rb b/app/services/survey_item_values.rb
index 34bc0dec..fb1ddd18 100644
--- a/app/services/survey_item_values.rb
+++ b/app/services/survey_item_values.rb
@@ -7,6 +7,11 @@ class SurveyItemValues
     @genders = genders
     @survey_items = survey_items
     @schools = schools
+    @disaggregation_data = disaggregation_data
+
+    copy_likert_scores_from_variant_survey_items
+    row["Income"] = income
+    row["Raw Income"] = raw_income
   end
 
   # Some survey items have variants, i.e. a survey item with an id of s-tint-q1 might have a variant that looks like s-tint-q1-1. We must ensure that all variants in the form of s-tint-q1-1 have a matching pair.
@@ -115,6 +120,8 @@ class SurveyItemValues
 
     return "Unknown" unless disaggregation_data.present?
 
+    byebug
+
     disaggregation = disaggregation_data[[lasid, district.name, academic_year.range]]
 
     return "Unknown" unless disaggregation.present?
@@ -147,7 +154,6 @@ class SurveyItemValues
   end
 
   def to_a
-    copy_likert_scores_from_variant_survey_items
     headers.select(&:present?)
            .reject { |key, _value| key.start_with? "Q" }
            .reject { |key, _value| key.end_with?
"-1" } @@ -238,7 +244,8 @@ class SurveyItemValues headers.filter(&:present?).filter { |header| header.end_with? "-1" }.each do |header| likert_score = row[header] main_item = header.gsub("-1", "") - row[main_item] = likert_score if likert_score.present? + row[main_item] = likert_score if likert_score.present? && row[main_item].blank? end end end + diff --git a/app/views/analyze/_grouped_bar_column.html.erb b/app/views/analyze/_grouped_bar_column.html.erb index 4e15b9ce..2f7fa614 100644 --- a/app/views/analyze/_grouped_bar_column.html.erb +++ b/app/views/analyze/_grouped_bar_column.html.erb @@ -1,14 +1,6 @@ <% score_label_y = [5, 10, 15, 5, 10, 15 ] %> <% column.bars.each_with_index do |bar, index| %> -<<<<<<< HEAD - - - <% if ENV["SCORES"].present? && ENV["SCORES"].upcase == "SHOW" %> - - <%= bar.average %> - -======= <% if column.sufficient_data?(index) %> @@ -29,7 +21,6 @@ <% end %> ->>>>>>> 67e469a6 (feat: add popover to analyze graphs that displays the n-size of the different columns. Make sure to only calculate a score for a race if there are more than 10 respondents to a question.) <% end %> <% end %> diff --git a/spec/fixtures/raw/sample_maynard_raw_student_survey.csv b/spec/fixtures/raw/sample_maynard_raw_student_survey.csv new file mode 100644 index 00000000..0317422c --- /dev/null +++ b/spec/fixtures/raw/sample_maynard_raw_student_survey.csv @@ -0,0 +1,36 @@ +StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,District,School,LASID,Gender,Race,What grade are you in?,s-emsa-q1,s-emsa-q2,s-emsa-q3,s-tint-q1,s-tint-q2,s-tint-q3,s-tint-q4,s-tint-q5,s-acpr-q1,s-acpr-q2,s-acpr-q3,s-acpr-q4,s-cure-q1,s-cure-q2,s-cure-q3,s-cure-q4,s-sten-q1,s-sten-q2,s-sten-q3,s-sper-q1,s-sper-q2,s-sper-q3,s-sper-q4,s-civp-q1,s-civp-q2,s-civp-q3,s-civp-q4,s-grmi-q1,s-grmi-q2,s-grmi-q3,s-grmi-q4,s-appa-q1,s-appa-q2,s-appa-q3,s-peff-q1,s-peff-q2,s-peff-q3,s-peff-q4,s-peff-q5,s-sbel-q1,s-sbel-q2,s-sbel-q3,s-sbel-q4,s-sbel-q5,s-phys-q1,s-phys-q2,s-phys-q3,s-phys-q4,s-vale-q1,s-vale-q2,s-vale-q3,s-vale-q4,s-acst-q1,s-acst-q2,s-acst-q3,s-sust-q1,s-sust-q2,s-grit-q1,s-grit-q2,s-grit-q3,s-grit-q4,s-expa-q1,s-poaf-q1,s-poaf-q2,s-poaf-q3,s-poaf-q4,s-tint-q1-1,s-tint-q2-1,s-tint-q3-1,s-tint-q4-1,s-tint-q5-1,s-acpr-q1-1,s-acpr-q2-1,s-acpr-q3-1,s-acpr-q4-1,s-peff-q1-1,s-peff-q2-1,s-peff-q3-1,s-peff-q4-1,s-peff-q5-1,s-peff-q6-1 +2023-03-17 7:57:47,2023-03-17 8:09:15,0,71.174.81.214,100,1000,1,2023-03-17T8:9:15,1000,2,1740505,1,2,4,9,3,5,5,,,,,,,,,,,,,,,,,,,,,,,,,4,4,3,5,,,,,,,,,4,4,2,3,2,5,5,5,5,4,2,2,4,3,2,3,3,5,4,4,3,5,2,3,3,4,4,4,1,2,5,5,,,,,4,4,4,3,4,5 +2023-03-17 8:02:15,2023-03-17 8:08:02,0,71.174.81.214,25,1000,1,2023-03-17T8:8:3,1001,2,1740505,2,1,5,10,,,,,,,,,,,,,,,,,,,,5,4,4,4,,,,,,,,,2,3,2,,,,,,4,3,2,4,3,5,5,4,4,4,4,3,5,3,4,3,2,4,3,4,3,3,1,2,2,2,3,,,,,,5,4,4,5,4,4,5,3,3,4 +2023-03-17 8:00:05,2023-03-17 8:07:39,0,71.174.81.214,24,1000,1,2023-03-17T8:7:39,1002,2,1740505,3,,,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,4,4,5,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, +2023-03-17 8:03:35,2023-03-17 8:15:38,0,71.174.81.214,0,1000,1,2023-03-17T8:15:38,1003,2,1740505,4,1,"1,4",10,4,5,5,,,,,,,,,,3,4,5,5,4,4,3,5,4,3,4,4,3,3,3,2,1,3,2,3,1,1,,,,,,,,,,,,,,,,,,,,,,5,4,3,5,3,4,3,3,3,5,4,5,4,4,5,5,5,5,5,5,,,,,, +2023-03-17 7:57:09,2023-03-17 8:12:26,0,71.174.81.214,,1000,1,2023-03-17T8:12:27,1004,2,1740505,5,1,"5,4",9,4,4,5,,,,,,,,,,,,,,,,,,,,,,,,,4,4,2,4,,,,,,,,,3,3,3,3,3,5,5,4,5,5,3,3,4,2,2,1,3,2,4,5,4,5,4,3,1,3,4,3,1,3,3,1,,,,,4,4,3,4,2,4 +2023-03-17 8:01:50,2023-03-17 
8:17:51,0,71.174.81.214,100,240,1,2023-03-17T8:17:52,1005,2,1740505,6,1,"5,4",9,4,3,4,,,,,,,,,,4,4,3,3,3,4,4,4,5,2,5,4,4,4,4,4,4,5,3,4,3,5,,,,,,4,4,4,5,2,,,,,,,,,,,,,,,,,,,,,,,4,1,4,4,4,5,4,4,5,4,5,5,5,5,4 +2023-03-17 8:01:45,2023-03-17 8:07:59,0,71.174.81.214,100,239,1,2023-03-17T8:8:0,1006,2,1740505,7,1,5,10,,,,,,,,,,,,,5,3,3,5,2,3,3,,,,,4,4,4,4,,,,,,,,,,,,,4,5,3,4,3,5,5,4,5,5,4,4,4,2,1,1,4,5,4,4,3,4,2,2,3,2,3,,,,,,,,,,4,4,5,4,4,5 +2023-03-17 9:07:09,2023-03-17 9:20:10,0,71.174.81.214,100,0,1,2023-03-17T9:20:11,1007,2,1740305,8,2,5,7,,,,,,,,,,,,,4,5,5,4,,4,3,,,,,5,4,4,5,,,,,,,,,,,,,5,5,4,5,5,5,5,5,5,4,5,5,5,3,2,3,3,3,5,4,4,4,3,3,4,,4,,,,,,,,,,4,5,5,5,5,5 +2023-03-17 8:02:11,2023-03-17 8:29:53,0,71.174.81.214,100,,1,2023-03-17T8:29:53,1008,2,1740505,9,1,"5,4",10,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,2,3,3,,,,,,3,4,3,3,3,5,5,5,5,4,4,2,3,2,3,2,2,5,2,4,3,3,1,3,3,3,4,,,,,,4,4,4,4,4,4,4,4,2,4 +2023-03-17 8:00:42,2023-03-17 8:12:00,0,71.174.81.214,100,1000,1,2023-03-17T8:12:0,1009,2,1740505,10,2,5,1,,,,,,,,,,,,,1,4,3,2,1,3,2,,,,,3,3,4,4,,,,,,,,,,,,,5,5,2,4,5,4,4,5,4,2,2,1,2,2,4,3,5,5,4,4,1,4,1,2,1,3,3,,,,,,,,,,3,4,3,1,1,1 +2023-03-17 8:03:09,2023-03-17 8:13:27,0,71.174.81.214,100,1000,1,2023-03-17T8:13:28,1010,2,1740505,11,1,5,2,,,,,,,,,,,,,5,3,2,4,2,2,3,,,,,4,5,5,5,,,,,,,,,,,,,4,4,3,4,4,4,4,4,5,4,3,5,4,2,1,2,4,4,5,4,3,5,4,1,3,3,3,,,,,,,,,,4,3,4,4,4,4 +2023-03-17 8:23:20,2023-03-17 8:34:00,0,71.174.81.214,100,1000,1,2023-03-17T8:34:0,1011,2,1740505,12,2,3,3,1,2,2,2,3,2,4,2,5,5,3,5,3,3,3,2,3,4,4,2,4,3,5,4,4,4,3,4,3,3,4,3,1,3,,,,,,,,,,,,,,,,,,,,,,1,2,4,4,3,4,3,2,2,4,4,,,,,,,,,,,,,,, +2023-03-17 8:36:36,2023-03-17 8:47:33,0,71.174.81.214,100,1000,1,2023-03-17T8:47:34,1012,2,1740505,13,1,3,4,4,5,4,,,,,,,,,,4,2,3,2,2,3,4,4,5,3,4,2,4,2,3,3,4,3,3,2,1,1,,,,,,,,,,,5,5,2,4,2,3,3,4,5,4,5,,,,,,,,,,,,4,2,3,3,2,4,4,3,3,,,,,, +2023-03-17 8:01:10,2023-03-17 8:09:17,0,71.174.81.214,100,1000,1,2023-03-17T8:9:18,1013,2,1740505,14,1,4,5,4,3,5,,,,,,,,,,3,4,3,4,3,4,4,2,4,4,2,3,1,2,2,4,2,3,5,2,2,1,,,,,,4,4,3,3,4,,,,,,,,,,,,,,,,,,,,,,,2,1,3,2,5,4,5,2,2,2,3,4,3,3,4 +2023-03-17 10:06:07,2023-03-17 10:12:54,0,71.174.81.214,100,1000,1,2023-03-17T10:12:56,1014,2,1740505,15,1,5,6,,,,,,,,,,,,,3,3,4,2,1,2,4,,,,,2,2,2,2,,,,,,,,,,,,,1,2,1,2,3,4,5,3,5,3,1,2,3,2,2,1,2,4,3,2,3,2,4,2,3,2,2,,,,,,,,,,4,4,4,4,4,5 +2023-03-17 7:57:13,2023-03-17 8:05:02,0,71.174.81.214,100,1000,1,2023-03-17T8:5:2,1015,2,1740505,16,4,5,7,,,,,,,,,,,,,,,,,,,,3,3,3,3,,,,,,,,,3,5,2,,,,,,2,1,3,2,4,4,4,3,3,2,2,3,4,3,5,5,5,4,3,4,2,4,5,3,1,3,2,,,,,,4,4,3,4,4,5,4,5,5,5 +2023-03-17 7:57:50,2023-03-17 8:02:53,0,71.174.81.214,100,1000,1,2023-03-17T8:2:54,1016,2,1740505,17,2,5,8,1,1,1,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,1,,,,,,,,,5,4,3,3,3,2,4,3,3,5,1,1,1,1,3,5,1,1,4,5,4,1,3,1,3,2,1,1,1,1,1,1,,,,,1,1,1,1,1,2 +2023-03-17 8:40:22,2023-03-17 8:53:19,0,71.174.81.214,100,1000,0,2023-03-18T8:53:20,1017,2,1740505,18,2,5,9,,,,,,,,,,,,,,,,,3,3,4,,,,,1,1,1,2,4,3,2,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,2,1,3,2,3,2,4,,,,,, +2023-03-17 8:37:13,2023-03-17 8:43:34,0,71.174.81.214,100,1000,1,2023-03-17T8:43:35,1018,2,1740505,19,2,2,10,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,2,2,1,,,,,,4,4,4,4,4,4,5,4,5,3,4,3,3,4,5,4,,,3,2,3,4,1,4,2,3,4,,,,,,4,5,5,4,4,4,5,4,3,4 +2023-03-17 8:36:27,2023-03-17 8:44:07,0,71.174.81.214,100,1000,1,2023-03-17T8:44:8,1019,2,1740505,20,1,2,11,3,4,3,,,,,,,,,,2,3,3,2,4,5,4,3,4,3,5,3,4,4,5,4,5,3,4,5,4,4,,,,,,3,4,3,4,3,,,,,,,,,,,,,,,,,,,,,,,4,2,,2,4,4,5,2,4,5,5,4,3,3,4 +2023-03-17 8:33:55,2023-03-17 
8:43:13,0,71.174.81.214,100,1000,1,2023-03-17T8:43:14,1020,2,1740505,21,1,5,12,3,1,3,,,,,,,,,,,,,,2,3,3,,4,,4,,4,,4,4,3,4,3,1,5,2,,,,,,3,4,2,3,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,4,5,5,5,4,,4,4,4 +2023-03-17 8:50:49,2023-03-17 9:13:18,0,71.174.81.214,100,1000,1,2023-03-17T9:13:19,1021,2,1740305,22,2,"5,2",1,,,,,,,,,,,,,2,3,3,3,2,3,3,,,,,3,4,4,4,,,,,,,,,,,,,2,2,1,3,2,5,5,4,5,4,2,2,2,1,1,1,1,3,3,4,3,4,4,1,1,1,3,,,,,,,,,,4,4,4,4,3,4 +2023-03-17 7:57:37,2023-03-17 8:04:25,0,71.174.81.214,100,1000,1,2023-03-17T8:4:25,1022,2,1740305,23,1,5,2,,,,,,,,,,,,,,,,,,,,4,4,4,5,,,,,,,,,4,4,5,,,,,,3,3,4,4,4,4,5,4,5,4,3,3,3,2,2,2,3,3,3,2,3,4,4,3,3,2,4,,,,,,3,2,4,3,3,3,3,4,2,2 +2023-03-17 8:01:47,2023-03-17 8:08:39,0,71.174.81.214,100,1000,1,2023-03-17T8:8:39,1023,2,1740305,24,1,"2,4",3,4,3,5,,,,,,,,,,2,2,2,2,1,2,2,4,3,2,3,2,3,3,4,3,4,2,3,4,3,2,,,,,,,,,,,5,4,3,5,3,1,2,2,2,2,4,,,,,,,,,,,,1,2,4,2,2,5,4,1,4,,,,,, +2023-03-17 8:37:21,2023-03-17 8:58:16,0,71.174.81.214,100,1000,1,2023-03-17T8:58:16,1024,2,1740305,25,1,5,4,3,3,3,,,,,,,,,,,,,,,,,,,,,,,,,4,4,2,3,,,,,,,,,3,4,4,3,2,5,5,4,5,3,3,2,3,4,3,3,2,4,4,3,3,3,2,2,3,3,4,3,3,3,3,3,,,,,4,3,2,3,3,3 +2023-03-17 8:02:25,2023-03-17 8:11:16,0,71.174.81.214,100,1000,0,2023-03-18T8:11:21,1025,2,1740305,26,2,5,5,,,,,,,,,,,,,,,,,3,3,3,4,4,4,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,,1,3,,,,,,4,3,3,3,3,,,,,,,,,, +2023-03-17 9:33:54,2023-03-17 9:57:21,0,71.174.81.214,100,1000,1,2023-03-17T9:57:22,1026,2,1740305,27,2,5,6,,,,,,,,,5,5,4,5,,,,,,,,1,2,2,2,,,,,,,,,1,1,2,5,4,4,4,3,5,4,,4,2,5,4,4,3,5,3,2,2,1,1,5,3,2,2,3,1,1,1,2,1,3,3,,,,,,,,,,,,,,, +2023-03-17 9:48:38,2023-03-17 9:58:45,0,71.174.81.214,100,1000,1,2023-03-17T9:58:45,1027,2,1740305,28,1,5,7,,,,,,,,,2,4,4,2,,,,,,,,3,3,3,5,,,,,,,,,2,3,3,4,3,2,2,3,1,3,2,3,2,3,3,3,2,4,3,4,2,2,1,1,5,5,2,3,2,3,5,4,3,2,2,,,,,,,,,,,,,,, +2023-03-17 8:36:40,2023-03-17 8:43:21,0,71.174.81.214,100,1000,1,2023-03-17T8:43:22,1028,2,1740305,29,2,5,8,,,,,,,,,,,,,,,,,,,,3,3,2,3,,,,,,,,,4,3,4,,,,,,3,3,2,4,3,5,4,4,5,4,4,2,3,4,2,5,4,4,3,3,3,3,2,2,3,1,4,,,,,,3,4,4,4,2,4,5,4,3,4 +2023-03-17 9:40:56,2023-03-17 9:52:52,0,71.174.81.214,100,1000,1,2023-03-17T9:52:52,1029,2,1740305,30,2,5,9,3,4,3,5,3,5,5,5,5,4,4,5,4,4,4,4,2,3,2,4,4,3,4,3,4,3,5,5,5,4,3,5,4,4,,,,,,,,,,,1,5,4,4,5,4,3,4,3,1,4,,,,,,,,,,,,,,,,,,,,,,,,,, +2023-03-17 9:33:58,2023-03-17 9:48:33,0,71.174.81.214,100,1000,1,2023-03-17T9:48:33,1030,2,1740305,31,2,5,10,,,,,,,,,4,3,5,2,,,,,,,,5,4,4,5,,,,,,,,,5,2,4,4,4,2,2,3,2,2,1,1,2,1,2,2,5,5,5,4,4,3,1,1,1,1,3,2,3,3,3,4,4,4,2,,,,,,,,,,,,,,, +2023-03-17 8:03:04,2023-03-17 8:23:33,0,71.174.81.214,100,1000,1,2023-03-17T8:23:33,1031,2,1740305,32,1,5,11,1,1,1,3,4,2,4,3,,,,,,,,,,,,,,,,,,,,5,5,5,5,,,,4,4,3,2,3,2,3,3,1,3,3,5,3,4,5,3,3,5,1,2,2,1,1,4,5,3,5,4,2,4,2,3,,,,,,,,,,,,,,, +2023-03-17 8:33:14,2023-03-17 8:41:01,0,71.174.81.214,100,1000,1,2023-03-17T8:41:2,1032,2,1740305,33,1,5,12,,,,,,,,,,,,,2,1,1,1,2,2,3,,,,,2,1,3,2,,,,,,,,,,,,,1,1,1,1,1,4,3,4,5,4,1,1,2,3,2,3,4,2,2,3,3,2,2,2,1,2,3,,,,,,,,,,3,2,2,1,2,2 +2023-03-17 7:57:06,2023-03-17 8:08:35,0,71.174.81.214,100,1000,1,2023-03-17T8:8:35,1033,2,1740505,34,2,5,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,2,2,2,2,2,2,2,2 +2023-03-17 7:58:38,2023-03-17 8:12:04,0,71.174.81.214,100,1000,1,2023-03-17T8:12:5,1034,2,1740505,35,2,"5,4",12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, diff --git a/spec/services/cleaner_spec.rb b/spec/services/cleaner_spec.rb index 4f3dc8b1..3c358e45 100644 --- a/spec/services/cleaner_spec.rb +++ b/spec/services/cleaner_spec.rb @@ 
-1,29 +1,29 @@ -require 'rails_helper' -require 'fileutils' +require "rails_helper" +require "fileutils" RSpec.describe Cleaner do - let(:district) { create(:district, name: 'District1') } - let(:second_district) { create(:district, name: 'District2') } + let(:district) { create(:district, name: "District1") } + let(:second_district) { create(:district, name: "District2") } let(:school) { create(:school, dese_id: 1_740_505, district:) } let(:second_school) { create(:school, dese_id: 222_222, district: second_district) } - let(:academic_year) { create(:academic_year, range: '2022-23') } + let(:academic_year) { create(:academic_year, range: "2022-23") } let(:respondents) { create(:respondent, school:, academic_year:, nine: 40, ten: 40, eleven: 40, twelve: 40) } - let(:recorded_date) { '2023-04-01' } + let(:recorded_date) { "2023-04-01" } let(:input_filepath) do - Rails.root.join('spec', 'fixtures', 'raw') + Rails.root.join("spec", "fixtures", "raw") end let(:output_filepath) do - Rails.root.join('tmp', 'spec', 'clean') + Rails.root.join("tmp", "spec", "clean") end let(:log_filepath) do - Rails.root.join('tmp', 'spec', 'removed') + Rails.root.join("tmp", "spec", "removed") end let(:common_headers) do - ['Recorded Date', 'Dese ID', 'ResponseID'] + ["Recorded Date", "Dese ID", "ResponseID"] end let(:standard_survey_items) do @@ -41,16 +41,16 @@ RSpec.describe Cleaner do end let(:short_form_survey_items) do - ([create(:survey_item, survey_item_id: 's-phys-q1', on_short_form: true), - create(:survey_item, survey_item_id: 's-phys-q2', on_short_form: true), - create(:survey_item, survey_item_id: 's-phys-q3', + ([create(:survey_item, survey_item_id: "s-phys-q1", on_short_form: true), + create(:survey_item, survey_item_id: "s-phys-q2", on_short_form: true), + create(:survey_item, survey_item_id: "s-phys-q3", on_short_form: true)].map(&:survey_item_id) << common_headers).flatten end let(:early_education_survey_items) do - ([create(:survey_item, survey_item_id: 's-emsa-es1'), - create(:survey_item, survey_item_id: 's-emsa-es2'), - create(:survey_item, survey_item_id: 's-emsa-es3')].map(&:survey_item_id) << common_headers).flatten + ([create(:survey_item, survey_item_id: "s-emsa-es1"), + create(:survey_item, survey_item_id: "s-emsa-es2"), + create(:survey_item, survey_item_id: "s-emsa-es3")].map(&:survey_item_id) << common_headers).flatten end let(:teacher_survey_items) do @@ -79,84 +79,232 @@ RSpec.describe Cleaner do respondents end - context 'Creating a new Cleaner' do - it 'creates a directory for the clean data' do + context "Creating a new Cleaner" do + it "creates a directory for the clean data" do Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean expect(output_filepath).to exist end - it 'creates a directory for the removed data' do + it "creates a directory for the removed data" do Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean expect(log_filepath).to exist end end - context '.filename' do - context 'defines a filename in the format: [district].[early_ed/short_form/standard/teacher].[year as 2022-23]' do - context 'when the file is based on standard survey items' do - it 'adds the survey type as standard to the filename' do + context ".process_raw_file" do + it "sorts data into valid and invalid csvs" do + cleaner = Cleaner.new(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:) + processed_data = cleaner.process_raw_file( + file: path_to_sample_raw_file, disaggregation_data: cleaner.disaggregation_data + ) + processed_data in 
[headers, clean_csv, log_csv, data] + + reads_headers_from_raw_csv(processed_data) + + valid_rows = %w[1000 1001 1004 1005 1008 1017 1018 1019 1020 1024 1025 1026 + 1027 1028] + valid_rows.each do |response_id| + valid_row = data.find { |row| row.response_id == response_id } + expect(valid_row.valid?).to eq true + end + + invalid_rows = %w[1002 1003 1006 1007 1009 1010 1011 1012 1013 1014 1015 1016 1021 1022 1023 1029 1030 1031 1032 + 1033 1034] + invalid_rows.each do |response_id| + invalid_row = data.find { |row| row.response_id == response_id } + expect(invalid_row.valid?).to eq false + end + + expect(clean_csv.length).to eq valid_rows.length + 1 # headers + rows + expect(log_csv.length).to eq invalid_rows.length + 1 # headers + rows + + csv_contains_the_correct_rows(clean_csv, valid_rows) + csv_contains_the_correct_rows(log_csv, invalid_rows) + invalid_rows_are_rejected_for_the_correct_reasons(data) + end + + it "adds dissaggregation data to the cleaned file " do + cleaner = Cleaner.new(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:) + processed_data = cleaner.process_raw_file( + file: path_to_sample_raw_file, disaggregation_data: cleaner.disaggregation_data + ) + processed_data in [headers, clean_csv, log_csv, data] + index_of_income = clean_csv.first.index("Income") + expect(clean_csv.second[index_of_income]).to eq "Economically Disadvantaged - Y" + + one_thousand = data.find { |row| row.response_id == "1000" } + expect(one_thousand.income).to eq "Economically Disadvantaged - Y" + + one_thousand_one = data.find { |row| row.response_id == "1001" } + expect(one_thousand_one.income).to eq "Economically Disadvantaged - N" + end + end + + context ".filename" do + context "defines a filename in the format: [district].[early_ed/short_form/standard/teacher].[year as 2022-23]" do + context "when the file is based on standard survey items" do + it "adds the survey type as standard to the filename" do survey_items = SurveyItem.where(survey_item_id: standard_survey_items) - data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: standard_survey_items, genders: nil, survey_items:, + data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: standard_survey_items, genders: nil, survey_items:, schools: School.school_hash)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: standard_survey_items, data: ) - expect(filename).to eq 'District1.standard.2022-23.csv' + expect(filename).to eq "District1.standard.2022-23.csv" end - context 'when the file is based on short form survey items' do - it 'adds the survey type as short form to the filename' do + context "when the file is based on short form survey items" do + it "adds the survey type as short form to the filename" do survey_items = SurveyItem.where(survey_item_id: short_form_survey_items) - data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: short_form_survey_items, genders: nil, survey_items:, + data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: short_form_survey_items, genders: nil, survey_items:, schools: School.school_hash)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: short_form_survey_items, data: ) - expect(filename).to eq 'District1.short_form.2022-23.csv' + expect(filename).to eq "District1.short_form.2022-23.csv" end 
end - context 'when the file is based on early education survey items' do - it 'adds the survey type as early education to the filename' do + context "when the file is based on early education survey items" do + it "adds the survey type as early education to the filename" do survey_items = SurveyItem.where(survey_item_id: early_education_survey_items) - data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: early_education_survey_items, genders: nil, survey_items:, + data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: early_education_survey_items, genders: nil, survey_items:, schools: School.school_hash)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: early_education_survey_items, data: ) - expect(filename).to eq 'District1.early_education.2022-23.csv' + expect(filename).to eq "District1.early_education.2022-23.csv" end end - context 'when the file is based on teacher survey items' do - it 'adds the survey type as teacher to the filename' do + context "when the file is based on teacher survey items" do + it "adds the survey type as teacher to the filename" do survey_items = SurveyItem.where(survey_item_id: teacher_survey_items) - data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: teacher_survey_items, genders: nil, survey_items:, + data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: teacher_survey_items, data: ) - expect(filename).to eq 'District1.teacher.2022-23.csv' + expect(filename).to eq "District1.teacher.2022-23.csv" end end - context 'when there is more than one district' do - it 'adds all districts to the filename' do + context "when there is more than one district" do + it "adds all districts to the filename" do survey_items = SurveyItem.where(survey_item_id: teacher_survey_items) - data = [SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '1_740_505' }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash), - SurveyItemValues.new(row: { 'Recorded Date' => recorded_date, 'Dese ID' => '222_222' }, + data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash), + SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "222_222" }, headers: teacher_survey_items, genders: nil, survey_items:, schools: School.school_hash)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: teacher_survey_items, data: ) - expect(filename).to eq 'District1.District2.teacher.2022-23.csv' + expect(filename).to eq "District1.District2.teacher.2022-23.csv" end end end end end end + +def reads_headers_from_raw_csv(processed_data) + processed_data in [headers, clean_csv, log_csv, data] + expect(headers.to_set.sort).to eq ["StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)", + "Finished", "RecordedDate", "ResponseId", "District", "School", + "LASID", "Gender", "Race", "What grade are you in?", "s-emsa-q1", "s-emsa-q2", "s-emsa-q3", "s-tint-q1", + "s-tint-q2", "s-tint-q3", "s-tint-q4", "s-tint-q5", "s-acpr-q1", 
"s-acpr-q2", + "s-acpr-q3", "s-acpr-q4", "s-cure-q1", "s-cure-q2", "s-cure-q3", "s-cure-q4", "s-sten-q1", "s-sten-q2", + "s-sten-q3", "s-sper-q1", "s-sper-q2", "s-sper-q3", "s-sper-q4", "s-civp-q1", "s-civp-q2", "s-civp-q3", + "s-civp-q4", "s-grmi-q1", "s-grmi-q2", "s-grmi-q3", "s-grmi-q4", "s-appa-q1", "s-appa-q2", "s-appa-q3", + "s-peff-q1", "s-peff-q2", "s-peff-q3", "s-peff-q4", "s-peff-q5", "s-peff-q6", "s-sbel-q1", "s-sbel-q2", + "s-sbel-q3", "s-sbel-q4", "s-sbel-q5", "s-phys-q1", "s-phys-q2", "s-phys-q3", "s-phys-q4", "s-vale-q1", + "s-vale-q2", "s-vale-q3", "s-vale-q4", "s-acst-q1", "s-acst-q2", "s-acst-q3", "s-sust-q1", "s-sust-q2", + "s-grit-q1", "s-grit-q2", "s-grit-q3", "s-grit-q4", "s-expa-q1", "s-poaf-q1", "s-poaf-q2", "s-poaf-q3", + "s-poaf-q4", "s-tint-q1-1", "s-tint-q2-1", "s-tint-q3-1", "s-tint-q4-1", "s-tint-q5-1", "s-acpr-q1-1", + "s-acpr-q2-1", "s-acpr-q3-1", "s-acpr-q4-1", "s-peff-q1-1", "s-peff-q2-1", "s-peff-q3-1", "s-peff-q4-1", + "s-peff-q5-1", "s-peff-q6-1", "Raw Income", "Income"].to_set.sort +end + +def invalid_rows_are_rejected_for_the_correct_reasons(data) + one_thousand_two = data.find { |row| row.response_id == "1002" } + expect(one_thousand_two.valid_progress?).to eq false + expect(one_thousand_two.valid_duration?).to eq true + expect(one_thousand_two.valid_grade?).to eq true + expect(one_thousand_two.valid_sd?).to eq true + + one_thousand_three = data.find { |row| row.response_id == "1003" } + expect(one_thousand_three.valid_progress?).to eq false + expect(one_thousand_three.valid_duration?).to eq true + expect(one_thousand_three.valid_grade?).to eq true + expect(one_thousand_three.valid_sd?).to eq true + + one_thousand_six = data.find { |row| row.response_id == "1006" } + expect(one_thousand_six.valid_progress?).to eq true + expect(one_thousand_six.valid_duration?).to eq false + expect(one_thousand_six.valid_grade?).to eq true + expect(one_thousand_six.valid_sd?).to eq true + + one_thousand_seven = data.find { |row| row.response_id == "1007" } + expect(one_thousand_seven.valid_progress?).to eq true + expect(one_thousand_seven.valid_duration?).to eq false + expect(one_thousand_seven.valid_grade?).to eq true + expect(one_thousand_seven.valid_sd?).to eq true + + one_thousand_seven = data.find { |row| row.response_id == "1007" } + expect(one_thousand_seven.valid_progress?).to eq true + expect(one_thousand_seven.valid_duration?).to eq false + expect(one_thousand_seven.valid_grade?).to eq true + expect(one_thousand_seven.valid_sd?).to eq true + + one_thousand_nine = data.find { |row| row.response_id == "1009" } + expect(one_thousand_nine.valid_progress?).to eq true + expect(one_thousand_nine.valid_duration?).to eq true + expect(one_thousand_nine.valid_grade?).to eq false + expect(one_thousand_nine.valid_sd?).to eq true + + one_thousand_ten = data.find { |row| row.response_id == "1010" } + expect(one_thousand_ten.valid_progress?).to eq true + expect(one_thousand_ten.valid_duration?).to eq true + expect(one_thousand_ten.valid_grade?).to eq false + expect(one_thousand_ten.valid_sd?).to eq true + + one_thousand_eleven = data.find { |row| row.response_id == "1011" } + expect(one_thousand_eleven.valid_progress?).to eq true + expect(one_thousand_eleven.valid_duration?).to eq true + expect(one_thousand_eleven.valid_grade?).to eq false + expect(one_thousand_eleven.valid_sd?).to eq true + + one_thousand_twenty_two = data.find { |row| row.response_id == "1022" } + expect(one_thousand_twenty_two.valid_progress?).to eq true + 
+  expect(one_thousand_twenty_two.valid_duration?).to eq true
+  expect(one_thousand_twenty_two.valid_grade?).to eq false
+  expect(one_thousand_twenty_two.valid_sd?).to eq true
+
+  one_thousand_twenty_three = data.find { |row| row.response_id == "1023" }
+  expect(one_thousand_twenty_three.valid_progress?).to eq true
+  expect(one_thousand_twenty_three.valid_duration?).to eq true
+  expect(one_thousand_twenty_three.valid_grade?).to eq false
+  expect(one_thousand_twenty_three.valid_sd?).to eq true
+
+  one_thousand_thirty_three = data.find { |row| row.response_id == "1033" }
+  expect(one_thousand_thirty_three.valid_progress?).to eq true
+  expect(one_thousand_thirty_three.valid_duration?).to eq true
+  expect(one_thousand_thirty_three.valid_grade?).to eq true
+  expect(one_thousand_thirty_three.valid_sd?).to eq false
+
+  one_thousand_thirty_four = data.find { |row| row.response_id == "1034" }
+  expect(one_thousand_thirty_four.valid_progress?).to eq true
+  expect(one_thousand_thirty_four.valid_duration?).to eq true
+  expect(one_thousand_thirty_four.valid_grade?).to eq true
+  expect(one_thousand_thirty_four.valid_sd?).to eq false
+end
+
+def csv_contains_the_correct_rows(csv, rows)
+  rows.each_with_index do |row, index|
+    response_id = 8 # ResponseId column (index 8)
+    expect(csv[index + 1][response_id]).to eq row
+  end
+end
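
The spec above drives the newly extracted Cleaner#process_raw_file directly. Below is a minimal console sketch of that flow, not part of the patch itself. It assumes the disaggregation_filepath: keyword on Cleaner.new and the #disaggregation_data reader behave as the spec exercises them (they are referenced there but their definitions are not shown in this diff); the fixture path is the new sample file added above.

cleaner = Cleaner.new(
  input_filepath: Rails.root.join("spec", "fixtures", "raw"),
  output_filepath: Rails.root.join("tmp", "spec", "clean"),
  log_filepath: Rails.root.join("tmp", "spec", "removed"),
  disaggregation_filepath: Rails.root.join("spec", "fixtures", "disaggregation") # assumed location
)

File.open(Rails.root.join("spec", "fixtures", "raw", "sample_maynard_raw_student_survey.csv")) do |file|
  # process_raw_file returns [headers, clean_csv, log_csv, data]
  headers, clean_csv, log_csv, data = cleaner.process_raw_file(
    file:, disaggregation_data: cleaner.disaggregation_data
  )

  puts "#{headers.length} columns, #{data.length} parsed responses"
  puts "#{clean_csv.length - 1} valid rows, #{log_csv.length - 1} rejected rows" # minus the header row
end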
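
The spec destructures that return value with a Ruby 3 one-line pattern match (processed_data in [headers, clean_csv, log_csv, data]), which both tests the shape and binds the four locals in one step. A standalone illustration with made-up values:

processed_data = [["ResponseId", "Income"], [%w[header], %w[1000]], [%w[header]], []]
processed_data in [headers, clean_csv, log_csv, data]
headers.last     # => "Income"
clean_csv.length # => 2
log_csv.length   # => 1
data             # => []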
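
The variant-column handling in this diff boils down to two rules: every "-1" variant header gets a main counterpart (include_all_headers), and a variant's Likert score is copied into the main column only when the main column is blank (the updated copy_likert_scores_from_variant_survey_items). A standalone sketch of those rules in plain Ruby, with hypothetical helper names and no app dependencies:

def include_all_headers(headers)
  headers.select { |h| h.to_s.end_with?("-1") }.each do |variant|
    main = variant.sub(/-1\z/, "")
    headers.push(main) unless headers.include?(main)
  end
  headers
end

def backfill_variants(row, headers)
  headers.select { |h| h.end_with?("-1") }.each do |variant|
    main = variant.sub(/-1\z/, "")
    score = row[variant]
    # Keep an existing main answer; only copy the variant score into a blank cell.
    row[main] = score if !score.nil? && score != "" && (row[main].nil? || row[main] == "")
  end
  row
end

headers = ["s-tint-q1", "s-tint-q1-1", "s-peff-q6-1"]
include_all_headers(headers) # headers now also contains "s-peff-q6"

row = { "s-tint-q1" => "", "s-tint-q1-1" => "4", "s-peff-q6" => "3", "s-peff-q6-1" => "5" }
backfill_variants(row, headers)
row["s-tint-q1"] # => "4" (blank main column backfilled from the variant)
row["s-peff-q6"] # => "3" (existing main answer preserved)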
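
SurveyItemValues now resolves income with disaggregation_data[[lasid, district.name, academic_year.range]] and falls back to "Unknown". The loader behind disaggregation_filepath: is not part of this diff, so the hash shape below is only an assumption inferred from that lookup key and from the income strings asserted in the spec:

# Hypothetical entries; the real structure may store richer rows per student.
disaggregation_data = {
  ["1", "District1", "2022-23"] => "Economically Disadvantaged - Y",
  ["2", "District1", "2022-23"] => "Economically Disadvantaged - N"
}

def income_for(disaggregation_data, lasid:, district_name:, range:)
  disaggregation_data[[lasid, district_name, range]] || "Unknown" # mirrors the "Unknown" fallback
end

income_for(disaggregation_data, lasid: "1", district_name: "District1", range: "2022-23")
# => "Economically Disadvantaged - Y"
income_for(disaggregation_data, lasid: "99", district_name: "District1", range: "2022-23")
# => "Unknown"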
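
The cleaned files keep the naming rule asserted in the .filename examples: [district names].[survey type].[academic year range].csv. A minimal sketch of that rule:

def output_filename(district_names, survey_type, range)
  (district_names + [survey_type.to_s, range, "csv"]).join(".")
end

output_filename(["District1"], :standard, "2022-23")
# => "District1.standard.2022-23.csv"
output_filename(["District1", "District2"], :teacher, "2022-23")
# => "District1.District2.teacher.2022-23.csv"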