mirror of
https://github.com/edcommonwealth/sqm-dashboards.git
synced 2026-03-08 23:18:18 -07:00
initialized with a season, e.g. "2024-25 Fall". Update scrapers for admin data, enrollment and staffing to use the new range standard correctly. Update the loaders for admin data, enrollment and staffing so that they populate all seasons in a given year, so admin data for 2024-25 gets loaded into both "2024-25 Fall" and "2024-25 Spring". Add tests for the new range format. Set the default cutoff for the start of the Spring season to the last Sunday in February.
148 lines
4.6 KiB
Ruby
148 lines
4.6 KiB
Ruby
require "fileutils"
|
|
class Cleaner
|
|
attr_reader :input_filepath, :output_filepath, :log_filepath
|
|
|
|
def initialize(input_filepath:, output_filepath:, log_filepath:)
|
|
@input_filepath = input_filepath
|
|
@output_filepath = output_filepath
|
|
@log_filepath = log_filepath
|
|
initialize_directories
|
|
end
|
|
|
|
def clean
|
|
Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
|
|
puts filepath
|
|
File.open(filepath) do |file|
|
|
processed_data = process_raw_file(file:)
|
|
processed_data in [headers, clean_csv, log_csv, data]
|
|
return if data.empty?
|
|
|
|
filename = filename(headers:, data:, filepath:)
|
|
write_csv(data: clean_csv, output_filepath:, filename:)
|
|
write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
|
|
end
|
|
end
|
|
end
|
|
|
|
def filename(headers:, data:, filepath:)
|
|
output = []
|
|
survey_item_ids = headers.filter(&:present?).filter do |header|
|
|
header.start_with?("s-", "t-")
|
|
end.reject { |item| item.end_with? "-1" }
|
|
survey_type = SurveyItem.survey_type(survey_item_ids:)
|
|
range = data.first.academic_year.range
|
|
|
|
districts = data.map do |row|
|
|
row.district.short_name
|
|
end.to_set.to_a
|
|
|
|
schools = data.map do |row|
|
|
row.school.name
|
|
end.to_set
|
|
|
|
part = filepath&.match(/[\b\s_.]+(part)[\W*_](?<label>[\w\d])/i)&.named_captures&.[]("label")&.upcase
|
|
|
|
school_name = schools.first.parameterize
|
|
|
|
output << districts.sort.join(".")
|
|
output << school_name if schools.length == 1
|
|
output << survey_type.to_s
|
|
output << "Part-" + part unless part.nil?
|
|
output << range.parameterize
|
|
output << "csv"
|
|
output.join(".")
|
|
end
|
|
|
|
def process_raw_file(file:)
|
|
clean_csv = []
|
|
log_csv = []
|
|
data = []
|
|
headers = CSV.parse(file.first).first
|
|
duplicate_header = headers.detect { |header| headers.count(header) > 1 }
|
|
unless duplicate_header.nil?
|
|
puts "\n>>>>>>>>>>>>>>>>>> Duplicate header found. This will misalign column headings. Please delete or rename the duplicate column: #{duplicate_header} \n>>>>>>>>>>>>>> \n"
|
|
end
|
|
headers = headers.to_set
|
|
headers = headers.merge(Set.new(["Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count",
|
|
"Race", "Gender"])).to_a
|
|
filtered_headers = include_all_headers(headers:)
|
|
filtered_headers = remove_unwanted_headers(headers: filtered_headers)
|
|
log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
|
|
"Valid Standard Deviation?"]).flatten
|
|
clean_csv << filtered_headers
|
|
log_csv << log_headers
|
|
|
|
all_survey_items = survey_items(headers:)
|
|
|
|
file.lazy.each_slice(1000) do |lines|
|
|
CSV.parse(lines.join, headers:).map do |row|
|
|
values = SurveyItemValues.new(row:, headers:,
|
|
survey_items: all_survey_items, schools:, academic_years:)
|
|
next unless values.valid_school?
|
|
|
|
data << values
|
|
values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
|
|
end
|
|
end
|
|
[headers, clean_csv, log_csv, data]
|
|
end
|
|
|
|
private
|
|
|
|
def academic_years
|
|
@academic_years ||= AcademicYear.all
|
|
end
|
|
|
|
def include_all_headers(headers:)
|
|
alternates = headers.filter(&:present?)
|
|
.filter { |header| header.match?(/^[st]-\w*-\w*-1$/i) }
|
|
alternates.each do |header|
|
|
main = header.sub(/-1\z/, "")
|
|
headers.push(main) unless headers.include?(main)
|
|
end
|
|
headers
|
|
end
|
|
|
|
def initialize_directories
|
|
create_ouput_directory
|
|
create_log_directory
|
|
end
|
|
|
|
def remove_unwanted_headers(headers:)
|
|
headers.to_set.to_a.compact.reject do |item|
|
|
item.start_with? "Q"
|
|
end.reject { |header| header.match?(/^[st]-\w*-\w*-1$/i) }
|
|
end
|
|
|
|
def write_csv(data:, output_filepath:, filename:, prefix: "")
|
|
csv = CSV.generate do |csv|
|
|
data.each do |row|
|
|
csv << row
|
|
end
|
|
end
|
|
File.write(output_filepath.join(prefix + filename), csv)
|
|
end
|
|
|
|
def schools
|
|
@schools ||= School.by_dese_id
|
|
end
|
|
|
|
def genders
|
|
@genders ||= Gender.by_qualtrics_code
|
|
end
|
|
|
|
def survey_items(headers:)
|
|
survey_item_ids = headers
|
|
.filter(&:present?)
|
|
.filter { |header| header.start_with? "t-", "s-" }
|
|
@survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
|
|
end
|
|
|
|
def create_ouput_directory
|
|
FileUtils.mkdir_p output_filepath
|
|
end
|
|
|
|
def create_log_directory
|
|
FileUtils.mkdir_p log_filepath
|
|
end
|
|
end
|