You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
167 lines
5.5 KiB
167 lines
5.5 KiB
require "fileutils"
|
|
# Reads every "*.csv" file under the input directory and, for each one, writes
# two CSVs:
#
# * a cleaned CSV (in +output_filepath+) holding the rows whose
#   SurveyItemValues report +valid?+, and
# * a "removed."-prefixed log CSV (in +log_filepath+) holding the remaining
#   rows with one extra column per validity check.
#
# Rows whose SurveyItemValues report no valid school are written to neither.
class Cleaner
  # Columns added to every file's header row when not already present.
  EXTRA_HEADERS = ["Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count",
                   "Race", "Gender"].freeze

  # Columns appended to the log CSV only, one per validity check.
  VALIDITY_HEADERS = ["Valid Duration?", "Valid Progress?", "Valid Grade?",
                      "Valid Standard Deviation?"].freeze

  # Survey item headers that carry a trailing "-1"; the code treats them as
  # alternates of the header with the same name minus that suffix.
  ALTERNATE_ITEM_HEADER = /^[st]-\w*-\w*-1$/i

  # Pulls a single-character "part" label out of a file path.
  PART_LABEL = /[\b\s_.]+(part)[\W*_](?<label>[\w\d])/i

  private_constant :EXTRA_HEADERS, :VALIDITY_HEADERS, :ALTERNATE_ITEM_HEADER, :PART_LABEL

  attr_reader :input_filepath, :output_filepath, :log_filepath

  # @param input_filepath directory (joined onto Rails.root) that is globbed for "*.csv"
  # @param output_filepath directory for cleaned CSVs; must respond to #join (see #write_csv)
  # @param log_filepath directory for "removed." CSVs; must respond to #join (see #write_csv)
  #
  # Creates the output and log directories as a side effect.
  def initialize(input_filepath:, output_filepath:, log_filepath:)
    @input_filepath = input_filepath
    @output_filepath = output_filepath
    @log_filepath = log_filepath
    initialize_directories
  end

  # Processes each CSV in the input directory and writes its cleaned and
  # "removed." counterparts. Files that yield no rows are skipped.
  def clean
    Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
      puts filepath
      File.open(filepath) do |file|
        headers, clean_csv, log_csv, data = process_raw_file(file:)
        # `next`, not `return`: a file without usable rows must not stop the
        # remaining files from being processed.
        next if data.empty?

        output_name = filename(headers:, data:, filepath:)
        write_csv(data: clean_csv, output_filepath:, filename: output_name)
        write_csv(data: log_csv, output_filepath: log_filepath, filename: output_name, prefix: "removed.")
      end
    end
  end

  # Builds the output file name as dot-joined segments:
  # districts, school (only when every row shares one school), survey type,
  # "Part-X" (only when the path carries a part label), academic year range, "csv".
  #
  # @param headers [Array<String, nil>] header row of the source file
  # @param data [Array] non-empty list of row objects responding to
  #   #academic_year, #district and #school
  # @param filepath [String, nil] source path, inspected only for a part label
  # @return [String]
  def filename(headers:, data:, filepath:)
    survey_item_ids = headers.filter(&:present?)
                             .filter { |header| header.start_with?("s-", "t-") }
                             .reject { |header| header.end_with?("-1") }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    range = data.first.academic_year.range
    districts = data.map { |row| row.district.short_name }.uniq
    schools = data.map { |row| row.school.name }.uniq
    part = filepath&.match(PART_LABEL)&.named_captures&.[]("label")&.upcase

    segments = [districts.sort.join(".")]
    segments << schools.first.parameterize if schools.length == 1
    segments << survey_type.to_s
    segments << "Part-#{part}" unless part.nil?
    segments << range.parameterize
    segments << "csv"
    segments.join(".")
  end

  # Parses an open CSV file into cleaned rows, logged rows and row objects.
  #
  # Calls `exit` (after printing a message) when a student survey lacks a
  # Grade header or when a header name is repeated.
  #
  # @param file [File] open handle positioned at the header line
  # @return [Array] `[headers, clean_csv, log_csv, data]`; all four are empty
  #   arrays when the file has no header line
  def process_raw_file(file:)
    header_line = file.first
    # An empty file has no header row; report "no data" so #clean skips it.
    return [[], [], [], []] if header_line.nil?

    headers = CSV.parse(header_line).first

    # If this is a student survey, make sure it includes a 'Grade' header.
    if student_survey_missing_grade_header?(headers)
      puts "could not find the Grade header. Stopping execution"
      exit
    end

    duplicate = duplicate_header(headers)
    if duplicate
      puts "\n>>>>>>>>>>>>>>>>>> Duplicate header found. This will misalign column headings. Please delete or rename the duplicate column: #{duplicate} \n>>>>>>>>>>>>>> \n"
      exit
    end

    headers |= EXTRA_HEADERS
    # include_all_headers appends to `headers` in place; the enlarged list is
    # what the survey item lookup and the row parsing below rely on.
    filtered_headers = remove_unwanted_headers(headers: include_all_headers(headers:))
    clean_csv = [filtered_headers]
    log_csv = [filtered_headers + VALIDITY_HEADERS]
    data = []

    all_survey_items = survey_items(headers:)

    # Parse 1000 lines at a time so the whole file is never held in memory.
    # NOTE(review): a quoted field containing a newline that straddles a slice
    # boundary would be split across two parses — confirm inputs never do that.
    file.each_slice(1000) do |lines|
      CSV.parse(lines.join, headers:).each do |row|
        values = SurveyItemValues.new(row:, headers:, survey_items: all_survey_items, schools:, academic_years:)
        next unless values.valid_school?

        data << values
        if values.valid?
          clean_csv << values.to_a
        else
          log_csv << (values.to_a + [values.valid_duration?.to_s, values.valid_progress?.to_s,
                                     values.valid_grade?.to_s, values.valid_sd?.to_s])
        end
      end
    end

    [headers, clean_csv, log_csv, data]
  end

  private

  # True when the headers contain a student item ("s-" prefix) but no header
  # matching /grade/i. Blank header cells are ignored.
  def student_survey_missing_grade_header?(headers)
    named = headers.filter(&:present?)
    named.any? { |header| header.start_with?("s-") } && named.none? { |header| header.match?(/grade/i) }
  end

  # First header name that occurs more than once, or nil. Blank (nil) header
  # cells are not considered duplicates of each other.
  def duplicate_header(headers)
    headers.compact.tally.find { |_header, count| count > 1 }&.first
  end

  # Memoized AcademicYear.all.
  def academic_years
    @academic_years ||= AcademicYear.all
  end

  # Memoized School.by_dese_id.
  def schools
    @schools ||= School.by_dese_id
  end

  # Memoized Gender.by_qualtrics_code.
  def genders
    @genders ||= Gender.by_qualtrics_code
  end

  # For every "-1" alternate header, appends the header without the "-1"
  # suffix unless it is already present. Mutates and returns +headers+.
  def include_all_headers(headers:)
    alternates = headers.filter(&:present?).filter { |header| header.match?(ALTERNATE_ITEM_HEADER) }
    alternates.each do |alternate|
      main = alternate.sub(/-1\z/, "")
      headers.push(main) unless headers.include?(main)
    end
    headers
  end

  # Returns a new, de-duplicated header list without nils, without headers
  # starting with "Q" and without "-1" alternate headers.
  def remove_unwanted_headers(headers:)
    headers.compact.uniq.reject do |header|
      header.start_with?("Q") || header.match?(ALTERNATE_ITEM_HEADER)
    end
  end

  # Looks up the SurveyItem records for the "t-"/"s-" headers. The lookup is
  # cached per set of ids so that a later file with different headers gets
  # its own survey items rather than the first file's.
  def survey_items(headers:)
    survey_item_ids = headers.filter(&:present?).filter { |header| header.start_with?("t-", "s-") }
    @survey_items_by_ids ||= {}
    @survey_items_by_ids[survey_item_ids.sort] ||= SurveyItem.where(survey_item_id: survey_item_ids)
  end

  # Serializes +data+ (an array of rows) as CSV and writes it to
  # +output_filepath+/+prefix++filename+.
  def write_csv(data:, output_filepath:, filename:, prefix: "")
    contents = CSV.generate do |csv|
      data.each { |row| csv << row }
    end
    File.write(output_filepath.join(prefix + filename), contents)
  end

  def initialize_directories
    create_output_directory
    create_log_directory
  end

  def create_output_directory
    FileUtils.mkdir_p(output_filepath)
  end

  def create_log_directory
    FileUtils.mkdir_p(log_filepath)
  end
end
|