mirror of
https://github.com/edcommonwealth/sqm-dashboards.git
synced 2026-03-09 15:38:21 -07:00
instead of hard coded values. Produce a clean csv and a csv with all the removed values and columns with reason for removal. Add script for running cleaning for each project
123 lines
3.5 KiB
Ruby
123 lines
3.5 KiB
Ruby
require 'fileutils'
|
|
# Cleans raw survey CSV exports.
#
# For each `*.csv` file under +input_filepath+ (resolved against `Rails.root`),
# every data row is wrapped in a SurveyItemValues object. Rows whose
# `valid_school?` is false are dropped entirely. The remaining rows are written
# either to a clean CSV in +output_filepath+ (when `valid?` is true) or to a
# `removed.`-prefixed CSV in +log_filepath+, followed by the individual
# validity flags.
class Cleaner
  # Extra columns appended to each removed row, in the order the flags are written.
  VALIDITY_HEADERS = [
    'Valid Duration?', 'Valid Progress?', 'Valid Grade?', 'Valid Standard Deviation?'
  ].freeze

  # +clean_csv+ and +log_csv+ are only appended to by #process_row; #clean keeps
  # its per-file rows in local variables and never touches them.
  attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv

  # Stores the three locations and makes sure the output and log directories exist.
  #
  # @param input_filepath directory (joined onto Rails.root) that holds the raw CSV files
  # @param output_filepath directory for cleaned files; must respond to #join (e.g. a Pathname)
  # @param log_filepath directory for `removed.` files; must respond to #join (e.g. a Pathname)
  def initialize(input_filepath:, output_filepath:, log_filepath:)
    @input_filepath = input_filepath
    @output_filepath = output_filepath
    @log_filepath = log_filepath
    # Start as empty arrays so #process_row can append without hitting nil.
    @clean_csv = []
    @log_csv = []
    initialize_directories
  end

  # Creates the output and log directories (no-op when they already exist).
  def initialize_directories
    create_output_directory
    create_log_directory
  end

  # Cleans every `*.csv` file found in the input directory, printing each path
  # as it is processed.
  #
  # @return [Array<String>] the paths that were globbed
  def clean
    Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
      puts filepath
      File.open(filepath) { |file| clean_file(file:, filepath:) }
    end
  end

  # Builds the output file name `<districts>.<survey type>.<academic year range>.csv`.
  #
  # @param headers [Array<String, nil>] header row of the source file
  # @param data [Array] non-empty list of row objects responding to
  #   `academic_year.range` and `district.name`
  # @return [String]
  # @raise [ArgumentError] when +data+ is empty
  def filename(headers:, data:)
    raise ArgumentError, 'data must contain at least one row' if data.empty?

    survey_item_ids = get_survey_item_ids_from_headers(headers:).reject { |id| id.end_with?('-1') }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    # The academic year is taken from the first row only.
    range = data.first.academic_year.range
    districts = data.map { |row| row.district.name }.uniq

    "#{districts.join('.')}.#{survey_type}.#{range}.csv"
  end

  # Drops nil and duplicate headers, plus any header that starts with 'Q' or
  # ends with '-1'. The order of first occurrence is preserved.
  #
  # @param headers [Array<String, nil>]
  # @return [Array<String>]
  def remove_unwanted_headers(headers:)
    headers.compact.uniq.reject do |header|
      header.start_with?('Q') || header.end_with?('-1')
    end
  end

  # Serialises +data+ as CSV and writes it to `output_filepath.join(prefix + filename)`.
  #
  # @param data [Array<Array>] rows to write, header row first
  # @param output_filepath target directory; must respond to #join (e.g. a Pathname)
  # @param filename [String]
  # @param prefix [String] prepended to +filename+
  # @return [Integer] number of bytes written
  def write_csv(data:, output_filepath:, filename:, prefix: '')
    contents = CSV.generate do |csv|
      data.each { |row| csv << row }
    end
    File.write(output_filepath.join(prefix + filename), contents)
  end

  # Appends the CSV-encoded +row+ to both #clean_csv and #log_csv.
  # NOTE(review): nothing in this class calls this method — confirm whether it is still needed.
  def process_row(row:)
    clean_csv << row.to_csv
    log_csv << row.to_csv
  end

  # Memoised result of School.school_hash.
  def schools
    @schools ||= School.school_hash
  end

  # Memoised hash of every Gender keyed by its `qualtrics_code`.
  def genders
    @genders ||= Gender.all.each_with_object({}) do |gender, by_code|
      by_code[gender.qualtrics_code] = gender
    end
  end

  # Looks up the SurveyItems named in +headers+.
  #
  # Results are cached per distinct id list, so files with different headers
  # do not reuse the survey items of the first file that was processed.
  #
  # @param headers [Array<String, nil>]
  def survey_items(headers:)
    survey_item_ids = get_survey_item_ids_from_headers(headers:)
    @survey_items ||= {}
    @survey_items[survey_item_ids] ||= SurveyItem.where(survey_item_id: survey_item_ids)
  end

  # Returns the headers that start with 't-' or 's-'; nil headers are ignored.
  #
  # @param headers [Array<String, nil>]
  # @return [Array<String>]
  def get_survey_item_ids_from_headers(headers:)
    headers.compact.select { |header| header.start_with?('t-', 's-') }
  end

  # Creates the output directory, including missing parents.
  def create_output_directory
    FileUtils.mkdir_p output_filepath
  end
  # The original, misspelled name is kept so existing callers keep working.
  alias_method :create_ouput_directory, :create_output_directory

  # Creates the log directory, including missing parents.
  def create_log_directory
    FileUtils.mkdir_p log_filepath
  end

  # Touches `path.join(filename)`.
  def create_file(path:, filename:)
    FileUtils.touch path.join(filename)
  end

  private

  # Cleans a single, already opened CSV file and writes its two result files.
  # Files without a header row, or without any row from a valid school, are
  # skipped with a warning instead of raising.
  def clean_file(file:, filepath:)
    header_line = file.first
    headers = header_line && CSV.parse(header_line).first
    if headers.nil?
      warn "Skipping #{filepath}: no header row"
      return
    end

    filtered_headers = remove_unwanted_headers(headers:)
    clean_rows = [filtered_headers]
    removed_rows = [filtered_headers + VALIDITY_HEADERS]
    all_survey_items = survey_items(headers:)
    data = []

    # Stream the rest of the file through one parser. Parsing fixed-size groups
    # of lines separately would break a quoted field that contains a newline
    # whenever it straddles a group boundary.
    CSV.new(file, headers:).each do |row|
      values = SurveyItemValues.new(row:, headers:, genders:, survey_items: all_survey_items, schools:)
      next unless values.valid_school?

      data << values
      if values.valid?
        clean_rows << values.to_a
      else
        # Build a new array rather than appending to whatever #to_a returned.
        removed_rows << (values.to_a + validity_flags(values))
      end
    end

    if data.empty?
      warn "Skipping #{filepath}: no rows with a valid school"
      return
    end

    name = filename(headers:, data:)
    write_csv(data: clean_rows, output_filepath:, filename: name)
    write_csv(data: removed_rows, output_filepath: log_filepath, prefix: 'removed.', filename: name)
  end

  # The four validity flags of +values+ as strings, in VALIDITY_HEADERS order.
  def validity_flags(values)
    [
      values.valid_duration?.to_s,
      values.valid_progress?.to_s,
      values.valid_grade?.to_s,
      values.valid_sd?.to_s
    ]
  end
end
|