require "rails_helper" require "fileutils" RSpec.describe Cleaner do let(:district) { create(:district, name: "Maynard Public Schools") } let(:second_district) { create(:district, name: "District2") } let(:school) { create(:school, dese_id: 1_740_505, district:, name: "Maynard High School") } let(:second_school) { create(:school, dese_id: 1_740_305, district:) } let(:third_school) { create(:school, dese_id: 222_222, district: second_district) } let(:academic_year) { create(:academic_year, range: "2022-23") } let(:respondents) do create(:respondent, school:, academic_year:, one: 0, nine: 40, ten: 40, eleven: 40, twelve: 40) create(:respondent, school: second_school, academic_year:, one: 0, four: 40, five: 40, six: 40, seven: 40, eight: 40) end let(:recorded_date) { "4/1/2023" } let(:input_filepath) do Rails.root.join("spec", "fixtures", "raw") end let(:output_filepath) do Rails.root.join("tmp", "spec", "clean") end let(:log_filepath) do Rails.root.join("tmp", "spec", "removed") end let(:path_to_sample_raw_file) do File.open(Rails.root.join("spec", "fixtures", "raw", "sample_maynard_raw_student_survey.csv")) end let(:path_to_file_with_duplicate_headers) do File.open(Rails.root.join("spec", "fixtures", "raw", "sample_file_with_duplicate_headers.csv")) end let(:common_headers) do ["Recorded Date", "Dese ID", "ResponseID"] end let(:standard_survey_items) do survey_item_ids = (%w[s-peff-q1 s-peff-q2 s-peff-q3 s-peff-q4 s-peff-q5 s-peff-q6 s-phys-q1 s-phys-q2 s-phys-q3 s-phys-q4 s-emsa-q1 s-emsa-q2 s-emsa-q3 s-sbel-q1 s-sbel-q2 s-sbel-q3 s-sbel-q4 s-sbel-q5 s-tint-q1 s-tint-q2 s-tint-q3 s-tint-q4 s-tint-q5 s-vale-q1 s-vale-q2 s-vale-q3 s-vale-q4 s-acpr-q1 s-acpr-q2 s-acpr-q3 s-acpr-q4 s-sust-q1 s-sust-q2 s-cure-q1 s-cure-q2 s-cure-q3 s-cure-q4 s-sten-q1 s-sten-q2 s-sten-q3 s-sper-q1 s-sper-q2 s-sper-q3 s-sper-q4 s-civp-q1 s-civp-q2 s-civp-q3 s-civp-q4 s-grit-q1 s-grit-q2 s-grit-q3 s-grit-q4 s-grmi-q1 s-grmi-q2 s-grmi-q3 s-grmi-q4 s-expa-q1 s-appa-q1 s-appa-q2 s-appa-q3 s-acst-q1 s-acst-q2 s-acst-q3 s-poaf-q1 s-poaf-q2 s-poaf-q3 s-poaf-q4] << common_headers).flatten survey_item_ids.map do |survey_item_id| create(:survey_item, survey_item_id:) end survey_item_ids end let(:short_form_survey_items) do ([create(:survey_item, survey_item_id: "s-phys-q1", on_short_form: true), create(:survey_item, survey_item_id: "s-phys-q2", on_short_form: true), create(:survey_item, survey_item_id: "s-phys-q3", on_short_form: true)].map(&:survey_item_id) << common_headers).flatten end let(:early_education_survey_items) do ([create(:survey_item, survey_item_id: "s-emsa-es1"), create(:survey_item, survey_item_id: "s-emsa-es2"), create(:survey_item, survey_item_id: "s-emsa-es3")].map(&:survey_item_id) << common_headers).flatten end let(:teacher_survey_items) do survey_item_ids = (%w[t-prep-q1 t-prep-q2 t-prep-q3 t-ieff-q1 t-ieff-q2 t-ieff-q3 t-ieff-q4 t-pcom-q1 t-pcom-q2 t-pcom-q3 t-pcom-q4 t-pcom-q5 t-inle-q1 t-inle-q2 t-inle-q3 t-prtr-q1 t-prtr-q2 t-prtr-q3 t-coll-q1 t-coll-q2 t-coll-q3 t-qupd-q1 t-qupd-q2 t-qupd-q3 t-qupd-q4 t-pvic-q1 t-pvic-q2 t-pvic-q3 t-psup-q1 t-psup-q2 t-psup-q3 t-psup-q4 t-acch-q1 t-acch-q2 t-acch-q3 t-reso-q1 t-reso-q2 t-reso-q3 t-reso-q4 t-reso-q5 t-sust-q1 t-sust-q2 t-sust-q3 t-sust-q4 t-curv-q1 t-curv-q2 t-curv-q3 t-curv-q4 t-cure-q1 t-cure-q2 t-cure-q3 t-cure-q4 t-peng-q1 t-peng-q2 t-peng-q3 t-peng-q4 t-ceng-q1 t-ceng-q2 t-ceng-q3 t-ceng-q4 t-sach-q1 t-sach-q2 t-sach-q3 t-psol-q1 t-psol-q2 t-psol-q3 t-expa-q2 t-expa-q3 t-phya-q2 t-phya-q3] << common_headers).flatten survey_item_ids.map do |survey_item_id| create(:survey_item, survey_item_id:) end survey_item_ids end before :each do school second_school third_school standard_survey_items short_form_survey_items early_education_survey_items teacher_survey_items academic_year respondents end context "When duplicate headers exist" do it "outputs a message to stdout" do output = capture_stdout { Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean } expect(output).to match "\n>>>>>>>>>>>>>>>>>> Duplicate header found. This will misalign column headings. Please delete or rename the duplicate column: StartDate \n>>>>>>>>>>>>>> \n" end end context "Creating a new Cleaner" do it "creates a directory for the clean data" do Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean expect(output_filepath).to exist end it "creates a directory for the removed data" do Cleaner.new(input_filepath:, output_filepath:, log_filepath:).clean expect(log_filepath).to exist end end context ".process_raw_file" do it "sorts data into valid and invalid csvs" do cleaner = Cleaner.new(input_filepath:, output_filepath:, log_filepath:) processed_data = cleaner.process_raw_file( file: path_to_sample_raw_file ) processed_data in [headers, clean_csv, log_csv, data] reads_headers_from_raw_csv(processed_data) valid_rows = %w[1000 1001 1003 1004 1005 1008 1017 1018 1019 1020 1024 1026 1027 1028] valid_rows.each do |response_id| valid_row = data.find { |row| row.response_id == response_id } expect(valid_row.valid?).to eq true end invalid_rows = %w[1002 1006 1007 1009 1010 1011 1012 1013 1014 1015 1016 1021 1022 1023 1025 1029 1030 1031 1032 1033 1034] invalid_rows.each do |response_id| invalid_row = data.find { |row| row.response_id == response_id } expect(invalid_row.valid?).to eq false end expect(clean_csv.length).to eq valid_rows.length + 1 # headers + rows expect(log_csv.length).to eq invalid_rows.length + 1 # headers + rows csv_contains_the_correct_rows(clean_csv, valid_rows) csv_contains_the_correct_rows(log_csv, invalid_rows) invalid_rows_are_rejected_for_the_correct_reasons(data) end end context ".filename" do context "defines a filename in the format: [district].[early_ed/short_form/standard/teacher].[year as 2022-23]" do context "when the file is based on standard survey items" do it "adds the survey type as standard to the filename" do survey_items = SurveyItem.where(survey_item_id: standard_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: standard_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: standard_survey_items, data:, filepath: nil ) expect(filename).to eq "maynard.maynard-high-school.standard.2022-23.csv" end context "when the file is based on short form survey items" do it "adds the survey type as short form to the filename" do survey_items = SurveyItem.where(survey_item_id: short_form_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: short_form_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: short_form_survey_items, data:, filepath: nil ) expect(filename).to eq "maynard.maynard-high-school.short_form.2022-23.csv" end end context "when the file is based on early education survey items" do it "adds the survey type as early education to the filename" do survey_items = SurveyItem.where(survey_item_id: early_education_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: early_education_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: early_education_survey_items, data:, filepath: nil ) expect(filename).to eq "maynard.maynard-high-school.early_education.2022-23.csv" end end context "when the file is based on teacher survey items" do it "adds the survey type as teacher to the filename" do survey_items = SurveyItem.where(survey_item_id: teacher_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: teacher_survey_items, data:, filepath: nil ) expect(filename).to eq "maynard.maynard-high-school.teacher.2022-23.csv" end end context "when there is more than one district" do it "adds all districts to the filename" do survey_items = SurveyItem.where(survey_item_id: teacher_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: teacher_survey_items, survey_items:, schools: School.by_dese_id), SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "222_222" }, headers: teacher_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: teacher_survey_items, data:, filepath: nil ) expect(filename).to eq "district2.maynard.teacher.2022-23.csv" end end context "when the file name includes the words 'part'" do it "adds the part to the filename" do survey_items = SurveyItem.where(survey_item_id: early_education_survey_items) data = [SurveyItemValues.new(row: { "Recorded Date" => recorded_date, "Dese ID" => "1_740_505" }, headers: early_education_survey_items, survey_items:, schools: School.by_dese_id)] filename = Cleaner.new(input_filepath:, output_filepath:, log_filepath:).filename( headers: early_education_survey_items, data:, filepath: "/data/survey_responses/maynard early ed_ part a.2022-23.csv" ) expect(filename).to eq "maynard.maynard-high-school.early_education.Part-A.2022-23.csv" end end end end end end def reads_headers_from_raw_csv(processed_data) processed_data in [headers, clean_csv, log_csv, data] expect(headers.to_set.sort).to eq ["StartDate", "EndDate", "Status", "IPAddress", "Progress", "Duration (in seconds)", "Finished", "RecordedDate", "ResponseId", "District", "School", "LASID", "Gender", "Race", "What grade are you in?", "s-emsa-q1", "s-emsa-q2", "s-emsa-q3", "s-tint-q1", "s-tint-q2", "s-tint-q3", "s-tint-q4", "s-tint-q5", "s-acpr-q1", "s-acpr-q2", "s-acpr-q3", "s-acpr-q4", "s-cure-q1", "s-cure-q2", "s-cure-q3", "s-cure-q4", "s-sten-q1", "s-sten-q2", "s-sten-q3", "s-sper-q1", "s-sper-q2", "s-sper-q3", "s-sper-q4", "s-civp-q1", "s-civp-q2", "s-civp-q3", "s-civp-q4", "s-grmi-q1", "s-grmi-q2", "s-grmi-q3", "s-grmi-q4", "s-appa-q1", "s-appa-q2", "s-appa-q3", "s-peff-q1", "s-peff-q2", "s-peff-q3", "s-peff-q4", "s-peff-q5", "s-peff-q6", "s-sbel-q1", "s-sbel-q2", "s-sbel-q3", "s-sbel-q4", "s-sbel-q5", "s-phys-q1", "s-phys-q2", "s-phys-q3", "s-phys-q4", "s-vale-q1", "s-vale-q2", "s-vale-q3", "s-vale-q4", "s-acst-q1", "s-acst-q2", "s-acst-q3", "s-sust-q1", "s-sust-q2", "s-grit-q1", "s-grit-q2", "s-grit-q3", "s-grit-q4", "s-expa-q1", "s-poaf-q1", "s-poaf-q2", "s-poaf-q3", "s-poaf-q4", "s-tint-q1-1", "s-tint-q2-1", "s-tint-q3-1", "s-tint-q4-1", "s-tint-q5-1", "s-acpr-q1-1", "s-acpr-q2-1", "s-acpr-q3-1", "s-acpr-q4-1", "s-peff-q1-1", "s-peff-q2-1", "s-peff-q3-1", "s-peff-q4-1", "s-peff-q5-1", "s-peff-q6-1", "Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count"].to_set.sort end def invalid_rows_are_rejected_for_the_correct_reasons(data) one_thousand_two = data.find { |row| row.response_id == "1002" } expect(one_thousand_two.valid_progress?).to eq false expect(one_thousand_two.valid_duration?).to eq true expect(one_thousand_two.valid_grade?).to eq true expect(one_thousand_two.valid_sd?).to eq true one_thousand_three = data.find { |row| row.response_id == "1003" } expect(one_thousand_three.valid_progress?).to eq true expect(one_thousand_three.valid_duration?).to eq true expect(one_thousand_three.valid_grade?).to eq true expect(one_thousand_three.valid_sd?).to eq true one_thousand_six = data.find { |row| row.response_id == "1006" } expect(one_thousand_six.valid_progress?).to eq true expect(one_thousand_six.valid_duration?).to eq false expect(one_thousand_six.valid_grade?).to eq true expect(one_thousand_six.valid_sd?).to eq true one_thousand_seven = data.find { |row| row.response_id == "1007" } expect(one_thousand_seven.valid_progress?).to eq true expect(one_thousand_seven.valid_duration?).to eq false expect(one_thousand_seven.valid_grade?).to eq true expect(one_thousand_seven.valid_sd?).to eq true one_thousand_seven = data.find { |row| row.response_id == "1007" } expect(one_thousand_seven.valid_progress?).to eq true expect(one_thousand_seven.valid_duration?).to eq false expect(one_thousand_seven.valid_grade?).to eq true expect(one_thousand_seven.valid_sd?).to eq true one_thousand_nine = data.find { |row| row.response_id == "1009" } expect(one_thousand_nine.valid_progress?).to eq true expect(one_thousand_nine.valid_duration?).to eq true expect(one_thousand_nine.valid_grade?).to eq false expect(one_thousand_nine.valid_sd?).to eq true one_thousand_ten = data.find { |row| row.response_id == "1010" } expect(one_thousand_ten.valid_progress?).to eq true expect(one_thousand_ten.valid_duration?).to eq true expect(one_thousand_ten.valid_grade?).to eq false expect(one_thousand_ten.valid_sd?).to eq true one_thousand_eleven = data.find { |row| row.response_id == "1011" } expect(one_thousand_eleven.valid_progress?).to eq true expect(one_thousand_eleven.valid_duration?).to eq true expect(one_thousand_eleven.valid_grade?).to eq false expect(one_thousand_eleven.valid_sd?).to eq true one_thousand_twenty_two = data.find { |row| row.response_id == "1022" } expect(one_thousand_twenty_two.valid_progress?).to eq true expect(one_thousand_twenty_two.valid_duration?).to eq true expect(one_thousand_twenty_two.valid_grade?).to eq false expect(one_thousand_twenty_two.valid_sd?).to eq true one_thousand_twenty_three = data.find { |row| row.response_id == "1023" } expect(one_thousand_twenty_three.valid_progress?).to eq true expect(one_thousand_twenty_three.valid_duration?).to eq true expect(one_thousand_twenty_three.valid_grade?).to eq false expect(one_thousand_twenty_three.valid_sd?).to eq true one_thousand_thirty_three = data.find { |row| row.response_id == "1033" } expect(one_thousand_thirty_three.valid_progress?).to eq false expect(one_thousand_thirty_three.valid_duration?).to eq true expect(one_thousand_thirty_three.valid_grade?).to eq true expect(one_thousand_thirty_three.valid_sd?).to eq false one_thousand_thirty_four = data.find { |row| row.response_id == "1034" } expect(one_thousand_thirty_four.valid_progress?).to eq false expect(one_thousand_thirty_four.valid_duration?).to eq true expect(one_thousand_thirty_four.valid_grade?).to eq true expect(one_thousand_thirty_four.valid_sd?).to eq false end def csv_contains_the_correct_rows(csv, rows) rows.each_with_index do |row, index| response_id = 8 # eigth column expect(csv[index + 1][response_id]).to eq row end end