sqm-dashboards/app/services/cleaner.rb
rebuilt 0f457becf0 feat: create a parents by language graph
Update demographics table with language options

Create a language table to hold the new languages

Update the demographic loader to input languages into the database

Update the cleaner to read the language column

Update the parent table to hold a reference to a language

Update the data uploader script to read the language from the csv and update the language information for any parent items that already exist (or create database entries if none already exist)

update the analyze interface to add controls for selecting ‘parents by group’ and a dropdown for ‘parent by language’

Update the analyze controller to read the parent-by-group parameter

Create a graph for the parent-by-group view

Bubble up averages for language calculations.

Make sure n-size only counts responses for a given measure.
2025-04-28 16:42:11 -07:00

166 lines
5.4 KiB
Ruby

require "fileutils"
# Cleans raw survey-response CSV exports.
#
# For every "*.csv" file found under +input_filepath+ the cleaner parses the
# rows, writes the rows that pass validation to +output_filepath+, and writes
# the rejected rows (together with the outcome of each validity check) to
# +log_filepath+ in a file whose name is prefixed with "removed.".
class Cleaner
  # Columns appended to every file's headers when the raw file lacks them.
  EXTRA_HEADERS = [
    "Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count",
    "Race", "Gender", "Raw Housing Status", "Housing Status", "Home Language", "Home Languages"
  ].freeze

  # Extra columns of the "removed" log, one per validity check.
  VALIDITY_HEADERS = [
    "Valid Duration?", "Valid Progress?", "Valid Grade?", "Valid Standard Deviation?"
  ].freeze

  # Number of raw lines handed to the CSV parser at a time.
  LINES_PER_SLICE = 1000

  attr_reader :input_filepath, :output_filepath, :log_filepath

  # Stores the three locations and creates the output and log directories.
  #
  # @param input_filepath directory (joined onto Rails.root) holding raw CSVs
  # @param output_filepath directory that receives the cleaned CSVs
  # @param log_filepath directory that receives the "removed." CSVs
  def initialize(input_filepath:, output_filepath:, log_filepath:)
    @input_filepath = input_filepath
    @output_filepath = output_filepath
    @log_filepath = log_filepath
    initialize_directories
  end

  # Processes every CSV in the input directory and writes a clean file and a
  # "removed." log file for each one that yielded at least one data row.
  def clean
    Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
      puts filepath
      File.open(filepath) do |file|
        headers, clean_csv, log_csv, data = process_raw_file(file:)
        # Skip only this file; `return` here would abandon all remaining files.
        next if data.empty?

        name = filename(headers:, data:, filepath:)
        write_csv(data: clean_csv, output_filepath:, filename: name)
        write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename: name)
      end
    end
  end

  # Builds the output file name as dot-separated segments:
  # districts, school (only when the file covers exactly one school), survey
  # type, optional "Part-X" label taken from the input path, academic-year
  # range, and the "csv" extension.
  #
  # @param headers [Array<String, nil>] header row of the processed file
  # @param data [Array] processed rows; each must respond to academic_year,
  #   district and school. Must not be empty.
  # @param filepath [String, nil] path of the raw input file
  # @return [String]
  def filename(headers:, data:, filepath:)
    survey_item_ids = headers
                      .select { |header| header&.start_with?("s-", "t-", "p-") }
                      .reject { |header| header.end_with?("-1") }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    range = data.first.academic_year.range
    districts = data.map { |row| row.district.short_name }.uniq
    schools = data.map { |row| row.school.name }.uniq
    # e.g. "..._part_a.csv" yields "A"
    part = filepath&.match(/[\b\s_.]+(part)[\W*_](?<label>[\w\d])/i)&.[]("label")&.upcase

    segments = [districts.sort.join(".")]
    segments << schools.first.parameterize if schools.size == 1
    segments << survey_type.to_s
    segments << "Part-#{part}" unless part.nil?
    segments << range.parameterize
    segments << "csv"
    segments.join(".")
  end

  # Parses one raw survey file.
  #
  # Terminates the process (via `exit`) when a student survey has no Grade
  # column or when a named header appears more than once.
  #
  # @param file [IO] an open, readable file positioned at its first line
  # @return [Array] four elements: the full header list, the clean rows
  #   (header row first), the rejected rows with validity columns (header row
  #   first), and the SurveyItemValues built for every row with a known school.
  #   An empty file yields four empty arrays.
  def process_raw_file(file:)
    header_line = file.first
    return [[], [], [], []] if header_line.nil?

    headers = CSV.parse(header_line).first
    abort_without_grade_header(headers)
    abort_on_duplicate_header(headers)

    headers |= EXTRA_HEADERS
    # include_all_headers appends to `headers` in place, so the list used for
    # parsing below also contains the added main survey-item columns.
    filtered_headers = remove_unwanted_headers(headers: include_all_headers(headers:))
    clean_csv = [filtered_headers]
    log_csv = [filtered_headers + VALIDITY_HEADERS]
    data = []
    all_survey_items = survey_items(headers:)

    # NOTE(review): slicing by line assumes no quoted field contains a
    # newline that straddles a slice boundary - confirm for the raw exports.
    file.each_slice(LINES_PER_SLICE) do |lines|
      CSV.parse(lines.join, headers:).each do |row|
        values = SurveyItemValues.new(row:, headers:, survey_items: all_survey_items,
                                      schools:, academic_years:)
        # Rows whose school cannot be found are dropped without logging.
        next unless values.valid_school?

        data << values
        if values.valid?
          clean_csv << values.to_a
        else
          log_csv << (values.to_a + [values.valid_duration?.to_s, values.valid_progress?.to_s,
                                     values.valid_grade?.to_s, values.valid_sd?.to_s])
        end
      end
    end

    [headers, clean_csv, log_csv, data]
  end

  private

  # Stops execution when the headers describe a student survey (any "s-"
  # column) but contain no header matching /grade/i.
  def abort_without_grade_header(headers)
    student_survey = headers.any? { |header| header&.start_with?("s-") }
    grade_header = headers.any? { |header| header&.match?(/grade/i) }
    return unless student_survey && !grade_header

    puts "could not find the Grade header. Stopping execution"
    exit
  end

  # Stops execution when a named header occurs more than once. Blank (nil)
  # headers are ignored so that repeated empty columns cannot hide a real
  # duplicate.
  def abort_on_duplicate_header(headers)
    duplicate_header = headers.compact.tally.find { |_header, count| count > 1 }&.first
    return if duplicate_header.nil?

    puts "\n>>>>>>>>>>>>>>>>>> Duplicate header found. This will misalign column headings. Please delete or rename the duplicate column: #{duplicate_header} \n>>>>>>>>>>>>>> \n"
    exit
  end

  def academic_years
    @academic_years ||= AcademicYear.all
  end

  # For every alternate survey-item header ending in "-1", makes sure the
  # matching main header (without the suffix) is present.
  #
  # Appends to +headers+ in place and returns the same array;
  # process_raw_file relies on that mutation.
  # NOTE(review): only "s-"/"t-" alternates are expanded here, while
  # remove_unwanted_headers also strips "p-" alternates - confirm intended.
  def include_all_headers(headers:)
    alternates = headers.select { |header| header&.match?(/^[st]-\w*-\w*-1$/i) }
    alternates.each do |alternate|
      main = alternate.sub(/-1\z/, "")
      headers.push(main) unless headers.include?(main)
    end
    headers
  end

  def initialize_directories
    create_ouput_directory
    create_log_directory
  end

  # Returns the unique, non-nil headers minus those starting with "Q" and the
  # alternate ("-1" suffixed) survey-item headers.
  def remove_unwanted_headers(headers:)
    headers.uniq.compact.reject do |header|
      header.start_with?("Q") || header.match?(/^[stp]-\w*-\w*-1$/i)
    end
  end

  # Serializes +data+ (an array of rows) as CSV and writes it to
  # "<output_filepath>/<prefix><filename>". +output_filepath+ may be a String
  # or a Pathname.
  def write_csv(data:, output_filepath:, filename:, prefix: "")
    contents = CSV.generate do |out|
      data.each { |row| out << row }
    end
    File.write(File.join(output_filepath, prefix + filename), contents)
  end

  def schools
    @schools ||= School.by_dese_id
  end

  def genders
    @genders ||= Gender.by_qualtrics_code
  end

  # Looks up the survey items named by the "t-", "s-" and "p-" headers.
  # Results are cached per id list, so files with different headers get
  # their own survey items.
  def survey_items(headers:)
    survey_item_ids = headers.select { |header| header&.start_with?("t-", "s-", "p-") }
    @survey_items_by_ids ||= {}
    @survey_items_by_ids[survey_item_ids] ||= SurveyItem.where(survey_item_id: survey_item_ids)
  end

  def create_ouput_directory
    FileUtils.mkdir_p output_filepath
  end

  def create_log_directory
    FileUtils.mkdir_p log_filepath
  end
end