feat: add ability to merge disaggregation data with raw survey data to

produce a cleaned csv with merged income disaggregation columns
This commit is contained in:
Nelson Jovel 2023-06-13 17:41:32 -07:00 committed by rebuilt
parent fae530d21f
commit 0a2c5e02c5
13 changed files with 1203 additions and 229 deletions

View file

@ -1,61 +1,38 @@
require "fileutils"
require 'fileutils'
class Cleaner
attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv
attr_reader :input_filepath, :output_filepath, :log_filepath, :disaggregation_filepath
def initialize(input_filepath:, output_filepath:, log_filepath:)
def initialize(input_filepath:, output_filepath:, log_filepath:, disaggregation_filepath:)
@input_filepath = input_filepath
@output_filepath = output_filepath
@log_filepath = log_filepath
@disaggregation_filepath = disaggregation_filepath
initialize_directories
end
def initialize_directories
create_ouput_directory
create_log_directory
end
# Cleans every "*.csv" file found under input_filepath (resolved against Rails.root).
#
# Each file is handed to process_raw_file. The rows it reports as clean are written
# to output_filepath; the rejected rows are written to log_filepath with a
# "removed." filename prefix. A file that produces no row objects is skipped.
def clean
  Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
    puts filepath
    File.open(filepath) do |file|
      headers, clean_csv, log_csv, data = process_raw_file(file:, disaggregation_data:)

      # `next` rather than `return`: a `return` here would leave #clean entirely,
      # so one file without usable rows would stop every remaining file from
      # being cleaned.
      next if data.empty?

      filename = filename(headers:, data:)
      write_csv(data: clean_csv, output_filepath:, filename:)
      write_csv(data: log_csv, output_filepath: log_filepath, prefix: 'removed.', filename:)
    end
  end
end
# Disaggregation records loaded from disaggregation_filepath.
# The loader runs on first use; the result is cached for later calls.
def disaggregation_data
  return @disaggregation_data if @disaggregation_data

  loader = DisaggregationLoader.new(path: disaggregation_filepath)
  @disaggregation_data = loader.load
end
def filename(headers:, data:)
survey_item_ids = headers.filter(&:present?).filter do |header|
header.start_with?("s-", "t-")
end.reject { |item| item.end_with? "-1" }
header.start_with?('s-', 't-')
end.reject { |item| item.end_with? '-1' }
survey_type = SurveyItem.survey_type(survey_item_ids:)
range = data.first.academic_year.range
@ -63,16 +40,51 @@ class Cleaner
row.district.short_name
end.to_set.to_a
districts.join(".").to_s + "." + survey_type.to_s + "." + range + ".csv"
districts.join('.').to_s + '.' + survey_type.to_s + '.' + range + '.csv'
end
# Turns one raw survey file into cleaned rows, rejected rows and row objects.
#
# file                - open IO for a raw survey CSV; its first line is read as the header row.
# disaggregation_data - lookup handed unchanged to every SurveyItemValues.
#
# Returns [headers, clean_csv, log_csv, data]:
#   headers   - the file's headers with 'Raw Income' and 'Income' appended
#   clean_csv - filtered header row followed by the rows for which valid? is true
#   log_csv   - filtered header row plus four validity columns, followed by the
#               rejected rows, each with its four validity flags as strings
#   data      - every SurveyItemValues that passed valid_school?
def process_raw_file(file:, disaggregation_data:)
  headers = CSV.parse(file.first).first
  headers << 'Raw Income'
  headers << 'Income'

  filtered_headers = remove_unwanted_headers(headers:)
  validity_headers = ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
                      'Valid Standard Deviation?']

  clean_csv = [filtered_headers]
  log_csv = [(filtered_headers + validity_headers).flatten]
  data = []

  all_survey_items = survey_items(headers:)

  # The remaining lines are parsed in batches of 1000.
  file.lazy.each_slice(1000) do |lines|
    CSV.parse(lines.join, headers:).each do |row|
      values = SurveyItemValues.new(row:, headers:, genders:,
                                    survey_items: all_survey_items, schools:, disaggregation_data:)
      next unless values.valid_school?

      data << values

      if values.valid?
        clean_csv << values.to_a
      else
        rejected = values.to_a
        rejected << values.valid_duration?.to_s
        rejected << values.valid_progress?.to_s
        rejected << values.valid_grade?.to_s
        rejected << values.valid_sd?.to_s
        log_csv << rejected
      end
    end
  end

  [headers, clean_csv, log_csv, data]
end
private
# Runs the two directory-creation helpers, output first and then log.
# Invoked from the constructor so the directories are prepared before #clean runs.
# NOTE(review): `create_ouput_directory` is spelled without the "t" in "output";
# the name is defined elsewhere in the class, so it is left untouched here.
def initialize_directories
create_ouput_directory
create_log_directory
end
def remove_unwanted_headers(headers:)
headers.to_set.to_a.compact.reject do |item|
item.start_with? "Q"
end.reject { |item| item.end_with? "-1" }
item.start_with? 'Q'
end.reject { |item| item.end_with? '-1' }
end
def write_csv(data:, output_filepath:, filename:, prefix: "")
def write_csv(data:, output_filepath:, filename:, prefix: '')
csv = CSV.generate do |csv|
data.each do |row|
csv << row
@ -81,34 +93,19 @@ class Cleaner
File.write(output_filepath.join(prefix + filename), csv)
end
def process_row(row:)
clean_csv << row.to_csv
log_csv << row.to_csv
end
# School lookup as returned by School.school_hash, fetched once and then reused.
def schools
  return @schools if @schools

  @schools = School.school_hash
end
def genders
@genders ||= begin
gender_hash = {}
Gender.all.each do |gender|
gender_hash[gender.qualtrics_code] = gender
end
gender_hash
end
@genders ||= Gender.gender_hash
end
def survey_items(headers:)
@survey_items ||= SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
end
def get_survey_item_ids_from_headers(headers:)
headers
.filter(&:present?)
.filter { |header| header.start_with? "t-", "s-" }
survey_item_ids = headers
.filter(&:present?)
.filter { |header| header.start_with? 't-', 's-' }
@survey_items ||= SurveyItem.where(survey_item_id: survey_item_ids)
end
def create_ouput_directory
@ -118,8 +115,4 @@ class Cleaner
# Creates the log directory (and any missing parent directories) if it does not exist yet.
def create_log_directory
  FileUtils.mkdir_p(log_filepath)
end
def create_file(path:, filename:)
FileUtils.touch path.join(filename)
end
end

View file

@ -0,0 +1,30 @@
# Loads disaggregation CSV files from a directory into an in-memory lookup.
class DisaggregationLoader
  attr_reader :path

  # path - directory holding the disaggregation CSV files; created if it is missing.
  def initialize(path:)
    @path = path
    initialize_directory
  end

  # Reads every "*.csv" file under path (resolved against Rails.root). The first
  # line of each file is taken as the header row.
  #
  # Returns a Hash mapping [lasid, district, academic_year] to the
  # DisaggregationRow built from that line. When a key occurs more than once,
  # the row read last replaces the earlier one.
  def load
    data = {}
    Dir.glob(Rails.root.join(path, '*.csv')).each do |filepath|
      puts filepath
      File.open(filepath) do |file|
        header_line = file.first
        # An empty file has no header line, and CSV.parse(nil) raises. Skip the
        # file instead of aborting the whole load.
        next if header_line.nil?

        headers = CSV.parse(header_line).first
        # The remaining lines are parsed in batches of 1000.
        file.lazy.each_slice(1000) do |lines|
          CSV.parse(lines.join, headers:).each do |row|
            values = DisaggregationRow.new(row:, headers:)
            data[[values.lasid, values.district, values.academic_year]] = values
          end
        end
      end
    end
    data
  end

  # Creates the directory at path, including missing parents.
  def initialize_directory
    FileUtils.mkdir_p(path)
  end
end

View file

@ -0,0 +1,35 @@
# One row of a disaggregation CSV. Fields are located by matching header names
# against a pattern rather than by exact header text.
class DisaggregationRow
  attr_reader :row, :headers

  # row     - the parsed row, indexable by header name.
  # headers - the list of header names for the file the row came from.
  def initialize(row:, headers:)
    @row = row
    @headers = headers
  end

  # Value of the first populated column whose header contains "District".
  def district
    @district ||= value_from(pattern: /District/i)
  end

  # Value of the first populated column whose header contains "Academic Year".
  def academic_year
    @academic_year ||= value_from(pattern: /Academic\s*Year/i)
  end

  # Value of the first populated column whose header contains "Low Income".
  def income
    @income ||= value_from(pattern: /Low\s*Income/i)
  end

  # Value of the first populated column whose header contains "LASID".
  def lasid
    @lasid ||= value_from(pattern: /LASID/i)
  end

  # Looks through the headers matching pattern, in order, and returns the first
  # truthy row value found under one of them (newlines are removed from the
  # header before the lookup).
  def value_from(pattern:)
    candidates = headers.select { |header| pattern.match(header) }
    keys = candidates.map { |header| header.delete("\n") }
    keys.reduce(nil) { |found, key| found || row[key] }
  end
end

View file

@ -1,12 +1,13 @@
class SurveyItemValues
attr_reader :row, :headers, :genders, :survey_items, :schools
attr_reader :row, :headers, :genders, :survey_items, :schools, :disaggregation_data
def initialize(row:, headers:, genders:, survey_items:, schools:)
def initialize(row:, headers:, genders:, survey_items:, schools:, disaggregation_data: nil)
@row = row
@headers = headers
@genders = genders
@survey_items = survey_items
@schools = schools
@disaggregation_data = disaggregation_data
end
def dese_id?
@ -51,7 +52,7 @@ class SurveyItemValues
def dese_id
@dese_id ||= begin
dese_id = nil
dese_headers = ["DESE ID", "Dese ID", "DeseId", "DeseID", "School", "school"]
dese_headers = ['DESE ID', 'Dese ID', 'DeseId', 'DeseID', 'School', 'school']
school_headers = headers.select { |header| /School-\s\w/.match(header) }
dese_headers << school_headers
dese_headers.flatten.each do |header|
@ -93,6 +94,31 @@ class SurveyItemValues
genders[gender_code]
end
# The respondent's LASID, taken from the first populated column whose header
# matches /LASID/i. Cached after the first successful lookup.
def lasid
  return @lasid if @lasid

  @lasid = value_from(pattern: /LASID/i)
end
# The respondent's income category exactly as recorded in the data.
#
# Lookup order:
#   1. a populated income column in the survey row itself;
#   2. the disaggregation record keyed by [lasid, district name, academic year range];
#   3. 'Unknown' when neither source has an entry.
#
# Previously a value found in the survey row was thrown away and 'Unknown' was
# returned whenever disaggregation data was missing or had no entry for the
# respondent.
def raw_income
  @raw_income ||= begin
    survey_value = value_from(pattern: /Income|Low\s*Income/i)
    if survey_value.present?
      survey_value
    elsif disaggregation_data.present?
      disaggregation = disaggregation_data[[lasid, district.name, academic_year.range]]
      disaggregation.present? ? disaggregation.income : 'Unknown'
    else
      'Unknown'
    end
  end
end
# Normalised income category derived from raw_income:
#   free lunch / reduced lunch / low income -> 'Economically Disadvantaged - Y'
#   not eligible                            -> 'Economically Disadvantaged - N'
#   anything else                           -> 'Unknown'
# The result is cached once computed.
def income
  return @income if @income

  @income =
    case raw_income
    when /Free\s*Lunch|Reduced\s*Lunch|Low\s*Income/i then 'Economically Disadvantaged - Y'
    when /Not\s*Eligible/i then 'Economically Disadvantaged - N'
    else 'Unknown'
    end
end
def value_from(pattern:)
output = nil
matches = headers.select do |header|
@ -106,10 +132,12 @@ class SurveyItemValues
def to_a
copy_likert_scores_from_variant_survey_items
row['Income'] = income
row['Raw Income'] = raw_income
headers.select(&:present?)
.reject { |key, _value| key.start_with? "Q" }
.reject { |key, _value| key.end_with? "-1" }
.map { |header| row[header] }
.reject { |key, _value| key.start_with? 'Q' }
.reject { |key, _value| key.end_with? '-1' }
.map { |header| row[header] }
end
def duration
@ -122,23 +150,23 @@ class SurveyItemValues
def respondent_type
return :teacher if headers
.filter(&:present?)
.filter { |header| header.start_with? "t-" }.count > 0
.filter(&:present?)
.filter { |header| header.start_with? 't-' }.count > 0
:student
end
def survey_type
survey_item_ids = headers
.filter(&:present?)
.reject { |header| header.end_with?("-1") }
.filter { |header| header.start_with?("t-", "s-") }
.filter(&:present?)
.reject { |header| header.end_with?('-1') }
.filter { |header| header.start_with?('t-', 's-') }
SurveyItem.survey_type(survey_item_ids:)
end
def valid_duration?
return true if duration.nil? || duration == "" || duration.downcase == "n/a" || duration.downcase == "na"
return true if duration.nil? || duration == '' || duration.downcase == 'n/a' || duration.downcase == 'na'
span_in_seconds = duration.to_i
return span_in_seconds >= 300 if survey_type == :teacher
@ -149,8 +177,8 @@ class SurveyItemValues
end
def valid_progress?
progress = row["Progress"]
return true if progress.nil? || progress == "" || progress.downcase == "n/a" || progress.downcase == "na"
progress = row['Progress']
return true if progress.nil? || progress == '' || progress.downcase == 'n/a' || progress.downcase == 'na'
progress = progress.to_i
progress.to_i >= 25
@ -175,7 +203,7 @@ class SurveyItemValues
def valid_sd?
return true if survey_type == :early_education
survey_item_headers = headers.filter(&:present?).filter { |header| header.start_with?("s-", "t-") }
survey_item_headers = headers.filter(&:present?).filter { |header| header.start_with?('s-', 't-') }
likert_scores = []
survey_item_headers.each do |header|
likert_scores << likert_score(survey_item_id: header).to_i
@ -193,9 +221,9 @@ class SurveyItemValues
private
def copy_likert_scores_from_variant_survey_items
headers.filter(&:present?).filter { |header| header.end_with? "-1" }.each do |header|
headers.filter(&:present?).filter { |header| header.end_with? '-1' }.each do |header|
likert_score = row[header]
main_item = header.gsub("-1", "")
main_item = header.gsub('-1', '')
row[main_item] = likert_score if likert_score.present?
end
end