mirror of
https://github.com/edcommonwealth/sqm-dashboards.git
synced 2026-03-07 21:48:16 -08:00
feat: add ability to merge disaggregation data with raw survey data to
produce a cleaned csv with merged income disaggregation columns
This commit is contained in:
parent
fae530d21f
commit
0a2c5e02c5
13 changed files with 1203 additions and 229 deletions
|
|
@ -1,61 +1,38 @@
|
|||
require "fileutils"
|
||||
require 'fileutils'
|
||||
class Cleaner
|
||||
attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv
|
||||
attr_reader :input_filepath, :output_filepath, :log_filepath, :disaggregation_filepath
|
||||
|
||||
# Stores the given paths in instance variables and then calls
# initialize_directories.
# NOTE(review): two signatures are shown because this text is a diff whose
# +/- markers were stripped. The one taking disaggregation_filepath: is the
# only one consistent with the @disaggregation_filepath assignment below, so it
# is presumably the post-commit signature — confirm.
def initialize(input_filepath:, output_filepath:, log_filepath:)
|
||||
def initialize(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:)
|
||||
@input_filepath = input_filepath
|
||||
@output_filepath = output_filepath
|
||||
@log_filepath = log_filepath
|
||||
@disaggregation_filepath = disaggregation_filepath
|
||||
# Runs the directory-creation helpers as part of construction.
initialize_directories
|
||||
end
|
||||
|
||||
# Calls create_ouput_directory and then create_log_directory.
# NOTE(review): an identical definition appears again after `private` in this
# diff; this earlier copy is presumably the pre-commit (removed) location — confirm.
def initialize_directories
|
||||
create_ouput_directory
|
||||
create_log_directory
|
||||
end
|
||||
|
||||
# Processes every "*.csv" directly under input_filepath (resolved against
# Rails.root). For each file it prints the path, runs process_raw_file, and
# writes two CSVs named by #filename: the clean rows into output_filepath and
# the rejected rows, prefixed with "removed.", into log_filepath.
#
# A file that yields no data rows is skipped; the remaining files are still
# processed.
def clean
  Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
    puts filepath

    # Block form of File.open closes the handle once parsing has finished.
    headers, clean_csv, log_csv, data = File.open(filepath) do |file|
      process_raw_file(file:, disaggregation_data:)
    end

    # `next`, not `return`: a file without usable rows must not abort the
    # whole run and leave the files after it unprocessed.
    next if data.empty?

    # Named differently from the #filename helper so the local cannot shadow it.
    csv_filename = filename(headers:, data:)
    write_csv(data: clean_csv, output_filepath:, filename: csv_filename)
    write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename: csv_filename)
  end
end
|
||||
|
||||
# Result of DisaggregationLoader#load for disaggregation_filepath. The loader
# is only invoked while no truthy result has been cached on the instance.
def disaggregation_data
  @disaggregation_data ||= begin
    loader = DisaggregationLoader.new(path: disaggregation_filepath)
    loader.load
  end
end
|
||||
|
||||
def filename(headers:, data:)
|
||||
survey_item_ids = headers.filter(&:present?).filter do |header|
|
||||
header.start_with?("s-", "t-")
|
||||
end.reject { |item| item.end_with? "-1" }
|
||||
header.start_with?('s-', 't-')
|
||||
end.reject { |item| item.end_with? '-1' }
|
||||
survey_type = SurveyItem.survey_type(survey_item_ids:)
|
||||
range = data.first.academic_year.range
|
||||
|
||||
|
|
@ -63,16 +40,51 @@ class Cleaner
|
|||
row.district.short_name
|
||||
end.to_set.to_a
|
||||
|
||||
districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
|
||||
districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
|
||||
end
|
||||
|
||||
# Parses one raw survey CSV into cleaned and rejected rows.
#
# @param file [#first, #lazy] open, line-enumerable IO positioned at the header line
# @param disaggregation_data [Object] passed through unchanged to SurveyItemValues
# @return [Array] `[headers, clean_csv, log_csv, data]` where
#   - headers is the file's header row with 'Raw Income' and 'Income' appended,
#   - clean_csv is the filtered header row followed by `to_a` of every valid row,
#   - log_csv is the filtered header row plus four validity columns, followed by
#     every invalid row with its four validity flags appended as strings,
#   - data holds the SurveyItemValues of every row whose school is valid.
#   All four are empty arrays when the file has no header line.
def process_raw_file(file:, disaggregation_data:)
  header_line = file.first
  raw_headers = header_line && CSV.parse(header_line).first
  # Nothing to parse: hand back empty collections instead of raising on nil.
  return [[], [], [], []] if raw_headers.nil?

  headers = raw_headers << 'Raw Income' << 'Income'
  filtered_headers = remove_unwanted_headers(headers:)
  log_headers = filtered_headers + ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
                                    'Valid Standard Deviation?']

  clean_csv = [filtered_headers]
  log_csv = [log_headers]
  data = []
  all_survey_items = survey_items(headers:)

  # The rest of the file is parsed in batches of 1000 lines.
  # NOTE(review): a quoted field containing a newline that straddles a batch
  # boundary would be split across two CSV.parse calls — confirm inputs never do this.
  file.lazy.each_slice(1000) do |lines|
    CSV.parse(lines.join, headers:).each do |row|
      values = SurveyItemValues.new(row:, headers:, genders:,
                                    survey_items: all_survey_items, schools:, disaggregation_data:)
      # Rows without a valid school are dropped entirely (neither clean nor logged).
      next unless values.valid_school?

      data << values
      if values.valid?
        clean_csv << values.to_a
      else
        log_csv << values.to_a.push(values.valid_duration?.to_s, values.valid_progress?.to_s,
                                    values.valid_grade?.to_s, values.valid_sd?.to_s)
      end
    end
  end

  [headers, clean_csv, log_csv, data]
end
|
||||
|
||||
private
|
||||
|
||||
# Calls create_ouput_directory and then create_log_directory; invoked from
# the constructor.
# NOTE(review): "ouput" is the helper's actual (misspelled) name in this file,
# so the call must keep that spelling unless the definition is renamed too.
def initialize_directories
|
||||
create_ouput_directory
|
||||
create_log_directory
|
||||
end
|
||||
|
||||
# Returns the distinct, non-nil headers, minus every entry that starts with
# "Q" or ends with "-1". First-seen order is preserved and the argument is not
# modified.
#
# @param headers [Array<String, nil>] raw header cells
# @return [Array<String>]
def remove_unwanted_headers(headers:)
  headers.compact.uniq.reject do |header|
    header.start_with?('Q') || header.end_with?('-1')
  end
end
|
||||
|
||||
def write_csv(data:, output_filepath:, filename:, prefix: "")
|
||||
def write_csv(data:, output_filepath:, filename:, prefix: '')
|
||||
csv = CSV.generate do |csv|
|
||||
data.each do |row|
|
||||
csv << row
|
||||
|
|
@ -81,34 +93,19 @@ class Cleaner
|
|||
File.write(output_filepath.join(prefix + filename), csv)
|
||||
end
|
||||
|
||||
# Appends row.to_csv to both clean_csv and log_csv.
# NOTE(review): clean_csv and log_csv are listed only in the shorter of the two
# attr_reader lines in this diff, and nothing visible calls this method — it is
# presumably removed by the commit; confirm before relying on it.
def process_row(row:)
|
||||
clean_csv << row.to_csv
|
||||
log_csv << row.to_csv
|
||||
end
|
||||
|
||||
# Value of School.school_hash, fetched on first use and reused afterwards for
# as long as the cached value is truthy.
def schools
  return @schools if @schools

  @schools = School.school_hash
end
|
||||
|
||||
# Memoized gender lookup stored in @genders.
# NOTE(review): two implementations are interleaved because the diff markers
# were stripped. The begin-block builds a hash keyed by each Gender's
# qualtrics_code; the final assignment delegates to Gender.gender_hash, which is
# defined outside this view and presumably replaces the begin-block — confirm.
def genders
|
||||
@genders ||= begin
|
||||
gender_hash = {}
|
||||
|
||||
Gender.all.each do |gender|
|
||||
gender_hash[gender.qualtrics_code] = gender
|
||||
end
|
||||
gender_hash
|
||||
end
|
||||
@genders ||= Gender.gender_hash
|
||||
end
|
||||
|
||||
# Looks up the SurveyItem records whose ids appear among the given headers.
#
# The lookup is memoized per distinct list of ids rather than once per
# instance: #clean processes several files in a row, and a single cached
# result would hand every later file the survey items of the first one.
#
# @param headers [Array<String, nil>] header row of the file being processed
# @return [Object] whatever SurveyItem.where returns for those ids
def survey_items(headers:)
  survey_item_ids = get_survey_item_ids_from_headers(headers:)
  @survey_items_by_ids ||= {}
  @survey_items_by_ids[survey_item_ids] ||= SurveyItem.where(survey_item_id: survey_item_ids)
end

# Picks out the headers that start with "t-" or "s-". nil cells are dropped
# first; blank strings can never match either prefix.
def get_survey_item_ids_from_headers(headers:)
  headers.compact.select { |header| header.start_with?('t-', 's-') }
end
|
||||
|
||||
def create_ouput_directory
|
||||
|
|
@ -118,8 +115,4 @@ class Cleaner
|
|||
# Creates the directory at log_filepath via FileUtils.mkdir_p.
def create_log_directory
  FileUtils.mkdir_p(log_filepath)
end
|
||||
|
||||
# Touches the file at path.join(filename) via FileUtils.touch.
# NOTE(review): nothing visible in this diff calls this method, and it sits in
# the final hunk, which shrinks from 8 lines to 4 — it is presumably deleted by
# the commit; confirm.
def create_file(path:, filename:)
|
||||
FileUtils.touch path.join(filename)
|
||||
end
|
||||
end
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue