You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
sqm-dashboards/app/services/cleaner.rb

167 lines
5.5 KiB

require "fileutils"
class Cleaner
attr_reader :input_filepath, :output_filepath, :log_filepath
def initialize(input_filepath:, output_filepath:, log_filepath:)
@input_filepath = input_filepath
@output_filepath = output_filepath
@log_filepath = log_filepath
initialize_directories
end
def clean
Dir.glob(Rails.root.join(input_filepath, "*.csv")).each do |filepath|
puts filepath
File.open(filepath) do |file|
processed_data = process_raw_file(file:)
processed_data in [headers, clean_csv, log_csv, data]
return if data.empty?
filename = filename(headers:, data:, filepath:)
write_csv(data: clean_csv, output_filepath:, filename:)
write_csv(data: log_csv, output_filepath: log_filepath, prefix: "removed.", filename:)
end
end
end
def filename(headers:, data:, filepath:)
output = []
survey_item_ids = headers.filter(&:present?).filter do |header|
header.start_with?("s-", "t-", "p-")
end.reject { |item| item.end_with? "-1" }
survey_type = SurveyItem.survey_type(survey_item_ids:)
range = data.first.academic_year.range
districts = data.map do |row|
row.district.short_name
end.to_set.to_a
schools = data.map do |row|
row.school.name
end.to_set
part = filepath&.match(/[\b\s_.]+(part)[\W*_](?<label>[\w\d])/i)&.named_captures&.[]("label")&.upcase
school_name = schools.first.parameterize
output << districts.sort.join(".")
output << school_name if schools.length == 1
output << survey_type.to_s
output << "Part-" + part unless part.nil?
output << range.parameterize
output << "csv"
output.join(".")
end
def process_raw_file(file:)
clean_csv = []
log_csv = []
data = []
headers = CSV.parse(file.first).first
# If this is a student survey
# Make sure it includes a 'Grade' header
is_student_survey = headers.filter(&:present?)
.filter { |header| header.start_with? "s-" }
.count > 0
has_grade_header = headers.filter(&:present?).find { |header| header.match?(/grade/i) }.present?
if is_student_survey && has_grade_header == false
puts "could not find the Grade header. Stopping execution"
exit
end
duplicate_header = headers.detect { |header| headers.count(header) > 1 }
unless duplicate_header.nil?
puts "\n>>>>>>>>>>>>>>>>>> Duplicate header found. This will misalign column headings. Please delete or rename the duplicate column: #{duplicate_header} \n>>>>>>>>>>>>>> \n"
exit
end
headers = headers.to_set
headers = headers.merge(Set.new(["Raw Income", "Income", "Raw ELL", "ELL", "Raw SpEd", "SpEd", "Progress Count",
"Race", "Gender", "Raw Housing Status", "Housing Status", "Home Language", "Home Languages", "Declared Races of Children from Parents", "Declared Genders of Children from Parents"])).to_a
filtered_headers = include_all_headers(headers:)
filtered_headers = remove_unwanted_headers(headers: filtered_headers)
log_headers = (filtered_headers + ["Valid Duration?", "Valid Progress?", "Valid Grade?",
"Valid Standard Deviation?"]).flatten
clean_csv << filtered_headers
log_csv << log_headers
all_survey_items = survey_items(headers:)
file.lazy.each_slice(1000) do |lines|
CSV.parse(lines.join, headers:).map do |row|
values = SurveyItemValues.new(row:, headers:,
survey_items: all_survey_items, schools:, academic_years:)
unless values.valid_school?
# puts "row #{values.response_id}: dese id :#{values.dese_id} : could not find school, skipping row for file #{file.path}"
next
end
data << values
values.valid? ? clean_csv << values.to_a : log_csv << (values.to_a << values.valid_duration?.to_s << values.valid_progress?.to_s << values.valid_grade?.to_s << values.valid_sd?.to_s)
end
end
[headers, clean_csv, log_csv, data]
end
private
def academic_years
@academic_years ||= AcademicYear.all
end
def include_all_headers(headers:)
alternates = headers.filter(&:present?)
.filter { |header| header.match?(/^[st]-\w*-\w*-1$/i) }
alternates.each do |header|
main = header.sub(/-1\z/, "")
headers.push(main) unless headers.include?(main)
end
headers
end
def initialize_directories
create_ouput_directory
create_log_directory
end
def remove_unwanted_headers(headers:)
headers.to_set.to_a.compact.reject do |item|
item.start_with? "Q"
end.reject { |header| header.match?(/^[stp]-\w*-\w*-1$/i) }
end
def write_csv(data:, output_filepath:, filename:, prefix: "")
csv = CSV.generate do |csv|
data.each do |row|
csv << row
end
end
File.write(output_filepath.join(prefix + filename), csv)
end
def schools
@schools ||= School.by_dese_id
end
def genders
@genders ||= Gender.by_qualtrics_code
end
def survey_items(headers:)
survey_item_ids = headers
.filter(&:present?)
.filter { |header| header.start_with? "t-", "s-", "p-" }
@survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
end
def create_ouput_directory
FileUtils.mkdir_p output_filepath
end
def create_log_directory
FileUtils.mkdir_p log_filepath
end
end