mirror of
https://github.com/edcommonwealth/sqm-dashboards.git
synced 2026-03-09 07:28:41 -07:00
Add automated data cleaning. Modify the SurveyItemValues class to use regexes
instead of hard-coded values. Produce a clean CSV and a CSV with all the removed values and columns, with the reason for removal. Add a script for running cleaning for each project.
This commit is contained in:
parent
9f33a776b6
commit
dbfc9d1d3a
20 changed files with 1146 additions and 152 deletions
|
|
@ -1,2 +1,5 @@
|
|||
class Gender < ApplicationRecord
  # Builds a lookup table of every gender record keyed by its qualtrics_code.
  #
  # Defined as a class method rather than a `scope` because it returns a Hash;
  # scope bodies are expected to return an ActiveRecord::Relation. The call
  # site (`Gender.gender_hash`) is unchanged.
  #
  # @return [Hash] qualtrics_code => Gender record
  def self.gender_hash
    all.index_by(&:qualtrics_code)
  end
end
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ class School < ApplicationRecord
|
|||
validates :name, presence: true
|
||||
|
||||
scope :alphabetic, -> { order(name: :asc) }
|
||||
scope :school_hash, -> { all.map { |school| [school.dese_id, school] }.to_h }
|
||||
|
||||
include FriendlyId
|
||||
friendly_id :name, use: [:slugged]
|
||||
|
|
|
|||
|
|
@ -18,13 +18,16 @@ class SurveyItem < ActiveRecord::Base
|
|||
# Survey items whose survey_item_id begins with "s-".
scope :student_survey_items, -> { where("survey_items.survey_item_id LIKE 's-%'") }

# Survey items whose survey_item_id begins with "s-" and contains "-q" later in the id.
scope :standard_survey_items, -> { where("survey_items.survey_item_id LIKE 's-%-q%'") }

# Survey items whose survey_item_id begins with "t-".
scope :teacher_survey_items, -> { where("survey_items.survey_item_id LIKE 't-%'") }
|
||||
scope :short_form_items, lambda {
|
||||
scope :short_form_survey_items, lambda {
|
||||
where(on_short_form: true)
|
||||
}
|
||||
scope :early_education_surveys, lambda {
|
||||
scope :early_education_survey_items, lambda {
|
||||
where("survey_items.survey_item_id LIKE '%-%-es%'")
|
||||
}
|
||||
|
||||
|
|
@ -51,15 +54,25 @@ class SurveyItem < ActiveRecord::Base
|
|||
|
||||
scope :survey_type_for_grade, lambda { |school, academic_year, grade|
|
||||
survey_items_set_by_grade = survey_items_for_grade(school, academic_year, grade).pluck(:survey_item_id).to_set
|
||||
if survey_items_set_by_grade.size > 0 && survey_items_set_by_grade.subset?(early_education_surveys.pluck(:survey_item_id).to_set)
|
||||
if survey_items_set_by_grade.size > 0 && survey_items_set_by_grade.subset?(early_education_survey_items.pluck(:survey_item_id).to_set)
|
||||
return :early_education
|
||||
end
|
||||
|
||||
:regular
|
||||
:standard
|
||||
}
|
||||
|
||||
# TODO: rename this to Summary
#
# Wraps this item's survey_item_id and prompt in a Summary.
# NOTE(review): the meaning of the third positional argument (`true`) is not
# visible from here - confirm against Summary's definition.
def description
  summary = Summary.new(survey_item_id, prompt, true)
  summary
end
|
||||
|
||||
# Classifies a collection of survey item ids by the kind of survey they come from.
#
# The ids are compared, in order, against the short-form, early-education,
# teacher and standard item sets; the first set that contains every id wins.
#
# @param survey_item_ids [Enumerable<String>] survey item ids, e.g. taken from CSV headers
# @return [Symbol] :short_form, :early_education, :teacher, :standard or :unknown
def self.survey_type(survey_item_ids:)
  ids = survey_item_ids.to_set

  # An empty set is a subset of every set, so without this guard input with no
  # survey items at all would be reported as :short_form.
  return :unknown if ids.empty?

  # pluck fetches only the id column instead of instantiating every record.
  return :short_form if ids.subset?(short_form_survey_items.pluck(:survey_item_id).to_set)
  return :early_education if ids.subset?(early_education_survey_items.pluck(:survey_item_id).to_set)
  return :teacher if ids.subset?(teacher_survey_items.pluck(:survey_item_id).to_set)
  return :standard if ids.subset?(standard_survey_items.pluck(:survey_item_id).to_set)

  :unknown
end
|
||||
end
|
||||
|
|
|
|||
123
app/services/cleaner.rb
Normal file
123
app/services/cleaner.rb
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
require 'fileutils'
|
||||
# Reads every CSV in an input directory and, for each one, writes two files:
# a "clean" CSV holding the rows that pass every SurveyItemValues validity
# check, and a "removed." CSV holding the rejected rows together with the
# result of each individual check. Rows whose school is not recognised are
# dropped from both files.
class Cleaner
  # Column titles appended to the header of the "removed" CSV, in the same
  # order as the values produced by #validity_flags.
  VALIDITY_HEADERS = ['Valid Duration?', 'Valid Progress?', 'Valid Grade?',
                      'Valid Standard Deviation?'].freeze

  attr_reader :input_filepath, :output_filepath, :log_filepath, :clean_csv, :log_csv

  # @param input_filepath  directory (joined onto Rails.root) that is searched for *.csv files
  # @param output_filepath [String, Pathname] directory that receives the clean CSVs
  # @param log_filepath    [String, Pathname] directory that receives the "removed." CSVs
  def initialize(input_filepath:, output_filepath:, log_filepath:)
    @input_filepath = input_filepath
    @output_filepath = output_filepath
    @log_filepath = log_filepath
    # Backing stores for #process_row; previously unassigned, so the readers
    # returned nil and #process_row raised NoMethodError.
    @clean_csv = []
    @log_csv = []
    initialize_directories
  end

  # Creates the output and log directories if they do not exist yet.
  def initialize_directories
    create_output_directory
    create_log_directory
  end

  # Cleans every *.csv file found under input_filepath, printing each path as it is processed.
  def clean
    Dir.glob(Rails.root.join(input_filepath, '*.csv')).each do |filepath|
      puts filepath
      File.open(filepath) { |file| clean_file(file:, filepath:) }
    end
  end

  # Builds the output file name: "<districts>.<survey type>.<academic year range>.csv".
  #
  # @param headers [Array<String, nil>] header row of the source CSV
  # @param data [Array<SurveyItemValues>] rows kept from the file; must not be empty
  # @raise [ArgumentError] when data is empty (there is no row to take the academic year from)
  # @return [String]
  def filename(headers:, data:)
    raise ArgumentError, 'data must contain at least one row' if data.empty?

    survey_item_ids = get_survey_item_ids_from_headers(headers:).reject { |id| id.end_with?('-1') }
    survey_type = SurveyItem.survey_type(survey_item_ids:)
    range = data.first.academic_year.range
    districts = data.filter_map { |values| values.district&.name }.uniq

    "#{districts.join('.')}.#{survey_type}.#{range}.csv"
  end

  # Returns the unique headers that are kept in the output: blank headers,
  # headers starting with "Q" and variant headers ending in "-1" are dropped.
  # Blank headers are dropped because RowMonkeyPatches#remove_unwanted_columns
  # drops blank keys from every row; keeping them here would shift the columns.
  #
  # @param headers [Array<String, nil>]
  # @return [Array<String>]
  def remove_unwanted_headers(headers:)
    headers.uniq.reject do |header|
      header.blank? || header.start_with?('Q') || header.end_with?('-1')
    end
  end

  # Serialises the rows as CSV and writes them to "<output_filepath>/<prefix><filename>".
  #
  # @param data [Array<Array>] rows, header row first
  # @param output_filepath [String, Pathname] target directory
  # @param filename [String]
  # @param prefix [String] prepended to the file name
  # @return [Integer] whatever File.write returns (the number of bytes written)
  def write_csv(data:, output_filepath:, filename:, prefix: '')
    contents = CSV.generate do |csv|
      data.each { |row| csv << row }
    end
    # File.join accepts both String and Pathname directories.
    File.write(File.join(output_filepath, prefix + filename), contents)
  end

  # Appends the CSV-encoded row to both clean_csv and log_csv.
  # NOTE(review): nothing visible in this class calls this method - confirm whether it is still needed.
  def process_row(row:)
    clean_csv << row.to_csv
    log_csv << row.to_csv
  end

  # @return [Hash] memoised result of School.school_hash
  def schools
    @schools ||= School.school_hash
  end

  # @return [Hash] memoised result of Gender.gender_hash (qualtrics_code => Gender)
  def genders
    @genders ||= Gender.gender_hash
  end

  # Looks up the SurveyItem records named in the given headers.
  # Deliberately not memoised: the result depends on `headers`, which differ
  # from file to file, and a cached value would leak the first file's items
  # into every later file.
  def survey_items(headers:)
    SurveyItem.where(survey_item_id: get_survey_item_ids_from_headers(headers:))
  end

  # @param headers [Array<String, nil>]
  # @return [Array<String>] the headers that start with "t-" or "s-"
  def get_survey_item_ids_from_headers(headers:)
    headers.select { |header| header.present? && header.start_with?('t-', 's-') }
  end

  # Creates the output directory (and any missing parents).
  def create_output_directory
    FileUtils.mkdir_p output_filepath
  end
  # The original, misspelled name is kept so existing callers continue to work.
  alias create_ouput_directory create_output_directory

  # Creates the log directory (and any missing parents).
  def create_log_directory
    FileUtils.mkdir_p log_filepath
  end

  # Touches "<path>/<filename>", creating an empty file if none exists.
  def create_file(path:, filename:)
    FileUtils.touch File.join(path, filename)
  end

  private

  # Cleans one already-opened CSV file and writes its clean and removed CSVs.
  def clean_file(file:, filepath:)
    header_line = file.first
    if header_line.nil?
      puts "Skipping #{filepath}: file is empty"
      return
    end

    headers = CSV.parse(header_line).first
    filtered_headers = remove_unwanted_headers(headers:)
    clean_rows = [filtered_headers]
    removed_rows = [filtered_headers + VALIDITY_HEADERS]
    data = []
    all_survey_items = survey_items(headers:)

    # Read in slices so the raw text of a large export is not held in memory at once.
    file.lazy.each_slice(1000) do |lines|
      CSV.parse(lines.join, headers:).each do |row|
        values = SurveyItemValues.new(row:, headers:, genders:, survey_items: all_survey_items, schools:)
        next unless values.valid_school?

        data << values
        if values.valid?
          clean_rows << values.to_a
        else
          removed_rows << (values.to_a + validity_flags(values))
        end
      end
    end

    # The file name is derived from the first kept row, so there is nothing to
    # name (or write) when no row belonged to a recognised school.
    if data.empty?
      puts "Skipping #{filepath}: no rows with a recognized school"
      return
    end

    output_name = filename(headers:, data:)
    write_csv(data: clean_rows, output_filepath:, filename: output_name)
    write_csv(data: removed_rows, output_filepath: log_filepath, prefix: 'removed.', filename: output_name)
  end

  # String results of each validity check, in VALIDITY_HEADERS order.
  def validity_flags(values)
    [values.valid_duration?, values.valid_progress?, values.valid_grade?, values.valid_sd?].map(&:to_s)
  end
end
|
||||
|
|
@ -71,7 +71,6 @@ module Dese
|
|||
likert_score = likert_score.round(2)
|
||||
end
|
||||
|
||||
# byebug if dese_id == 30_305
|
||||
output = []
|
||||
output << raw_likert_score
|
||||
output << likert_score
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ class StudentLoader
|
|||
file.lazy.each_slice(1_000) do |lines|
|
||||
CSV.parse(lines.join, headers:).map do |row|
|
||||
next if rules.any? do |rule|
|
||||
rule.new(row: SurveyItemValues.new(row:, headers:, genders: nil, survey_items: nil)).skip_row?
|
||||
rule.new(row: SurveyItemValues.new(row:, headers:, genders: nil, survey_items: nil,
|
||||
schools:)).skip_row?
|
||||
end
|
||||
|
||||
process_row(row:)
|
||||
|
|
@ -27,7 +28,8 @@ class StudentLoader
|
|||
|
||||
CSV.parse(line, headers:).map do |row|
|
||||
next if rules.any? do |rule|
|
||||
rule.new(row: SurveyItemValues.new(row:, headers:, genders: nil, survey_items: nil)).skip_row?
|
||||
rule.new(row: SurveyItemValues.new(row:, headers:, genders: nil, survey_items: nil,
|
||||
schools:)).skip_row?
|
||||
end
|
||||
|
||||
process_row(row:)
|
||||
|
|
@ -45,6 +47,10 @@ class StudentLoader
|
|||
find_or_create_student(response_id:, lasid:, races:)
|
||||
end
|
||||
|
||||
# Memoised lookup of every school keyed by dese_id.
# Delegates to School.school_hash so the mapping is defined in one place and
# matches what the other loaders use.
#
# @return [Hash] dese_id => School
def self.schools
  @schools ||= School.school_hash
end
|
||||
|
||||
def self.race_codes(row:)
|
||||
race_codes = row['race'] || row['RACE'] || row['Race'] || row['What is your race/ethnicity?(Please select all that apply) - Selected Choice'] || row['What is your race/ethnicity?'] || '99'
|
||||
race_codes.split(',').map(&:to_i) || []
|
||||
|
|
|
|||
|
|
@ -1,11 +1,12 @@
|
|||
class SurveyItemValues
|
||||
attr_reader :row, :headers, :genders, :survey_items
|
||||
attr_reader :row, :headers, :genders, :survey_items, :schools
|
||||
|
||||
def initialize(row:, headers:, genders:, survey_items:)
|
||||
def initialize(row:, headers:, genders:, survey_items:, schools:)
|
||||
@row = row
|
||||
@headers = headers
|
||||
@genders = genders
|
||||
@survey_items = survey_items
|
||||
@schools = schools
|
||||
end
|
||||
|
||||
def dese_id?
|
||||
|
|
@ -13,7 +14,10 @@ class SurveyItemValues
|
|||
end
|
||||
|
||||
def response_date
|
||||
@response_date ||= Date.parse(row['Recorded Date'] || row['RecordedDate'])
|
||||
@response_date ||= begin
|
||||
recorded_date = value_from(pattern: /Recorded\s*Date/i)
|
||||
Date.parse(recorded_date)
|
||||
end
|
||||
end
|
||||
|
||||
def academic_year
|
||||
|
|
@ -41,11 +45,21 @@ class SurveyItemValues
|
|||
end
|
||||
|
||||
def response_id
|
||||
@response_id ||= row['Response ID'] || row['ResponseId'] || row['ResponseID']
|
||||
@response_id ||= value_from(pattern: /Response\s*ID/i)
|
||||
end
|
||||
|
||||
def dese_id
|
||||
@dese_id ||= (row['DESE ID' || 'Dese ID'] || row['DeseId'] || row['DeseID'] || row['School'] || row['school']).to_i
|
||||
@dese_id ||= begin
|
||||
dese_id = nil
|
||||
dese_headers = ['DESE ID', 'Dese ID', 'DeseId', 'DeseID', 'School', 'school']
|
||||
school_headers = headers.select { |header| /School-\s\w/.match(header) }
|
||||
dese_headers << school_headers
|
||||
dese_headers.flatten.each do |header|
|
||||
dese_id ||= row[header]
|
||||
end
|
||||
|
||||
dese_id.to_i
|
||||
end
|
||||
end
|
||||
|
||||
def likert_score(survey_item_id:)
|
||||
|
|
@ -56,13 +70,14 @@ class SurveyItemValues
|
|||
@school ||= schools[dese_id]
|
||||
end
|
||||
|
||||
def schools
|
||||
@schools ||= School.all.map { |school| [school.dese_id, school] }.to_h
|
||||
def district
|
||||
@district ||= school&.district
|
||||
end
|
||||
|
||||
def grade
|
||||
@grade ||= begin
|
||||
raw_grade = (row['grade'] || row['Grade'] || row['What grade are you in?'])
|
||||
raw_grade = value_from(pattern: /Grade|What grade are you in?/i)
|
||||
|
||||
return nil if raw_grade.blank?
|
||||
|
||||
raw_grade.to_i
|
||||
|
|
@ -70,10 +85,105 @@ class SurveyItemValues
|
|||
end
|
||||
|
||||
def gender
|
||||
gender_code = row['gender'] || row['Gender'] || row['What is your gender?'] || row['What is your gender? - Selected Choice'] || 99
|
||||
gender_code = value_from(pattern: /Gender|What is your gender?|What is your gender? - Selected Choice/i)
|
||||
gender_code ||= 99
|
||||
gender_code = gender_code.to_i
|
||||
gender_code = 4 if gender_code == 3
|
||||
gender_code = 99 if gender_code.zero?
|
||||
genders[gender_code]
|
||||
end
|
||||
|
||||
# Returns the row's value for the first header that matches `pattern` and has
# a non-nil value, or nil when there is none. Newlines are stripped from a
# matching header before it is used to look up the row.
#
# @param pattern [Regexp] matched against each header
# @return [Object, nil]
def value_from(pattern:)
  headers.each do |header|
    next unless pattern.match(header)

    value = row[header.delete("\n")]
    return value if value
  end

  nil
end
|
||||
|
||||
# Copies variant ("-1") likert scores onto their main items, then returns the
# row's values with the unwanted columns removed.
#
# @return [Array] the values returned by row.remove_unwanted_columns
def to_a
  copy_likert_scores_from_variant_survey_items
  cleaned_values = row.remove_unwanted_columns
  cleaned_values
end
|
||||
|
||||
# Memoised integer value of the first duration column found in the row.
# A missing or non-numeric value becomes 0 (String#to_i / NilClass#to_i).
#
# @return [Integer]
def duration
  @duration ||= begin
    raw_duration = value_from(pattern: /Duration|Duration \(in seconds\)|Duration\.\.\(in\.seconds\)/i)
    raw_duration.to_i
  end
end
|
||||
|
||||
# True only when the row passes every check. The checks run in this order and
# stop at the first failure: duration, progress, grade, standard deviation.
#
# @return [Boolean]
def valid?
  return false unless valid_duration?
  return false unless valid_progress?
  return false unless valid_grade?

  valid_sd?
end
|
||||
|
||||
# :teacher when at least one non-blank header starts with "t-", otherwise :student.
#
# @return [Symbol]
def survey_type
  teacher_item = headers.any? { |header| header.present? && header.start_with?('t-') }
  teacher_item ? :teacher : :student
end
|
||||
|
||||
# Whether the response took long enough to count: at least 300 for teacher
# surveys and at least 240 for all others.
# NOTE(review): the unit is presumably seconds, judging by the duration header names - confirm.
#
# @return [Boolean]
def valid_duration?
  minimum_duration = survey_type == :teacher ? 300 : 240
  duration >= minimum_duration
end
|
||||
|
||||
# Whether the row's "Progress" column, read as an integer, is at least 25.
# A missing value counts as 0.
#
# @return [Boolean]
def valid_progress?
  progress = row['Progress'].to_i
  progress >= 25
end
|
||||
|
||||
# Whether the row's grade is plausible for its school and academic year.
#
# Rows without a grade and rows from teacher surveys always pass. Otherwise
# the first Respondent record for the school and academic year must list a
# count greater than zero for the grade; a missing record or missing count
# fails the check. A count that is present but not positive also fails and is
# reported on stdout.
#
# @return [Boolean]
def valid_grade?
  return true if grade.nil? || survey_type == :teacher

  respondents = Respondent.where(school:, academic_year:).first
  return false unless respondents.present?

  enrollment = respondents.counts_by_grade[grade]
  return false unless enrollment.present?

  return true if enrollment > 0

  puts "Invalid grade #{grade} for #{school.name} #{academic_year.formatted_range}"
  false
end
|
||||
|
||||
# Whether the answers vary at all, i.e. the respondent did not give the same
# answer to every survey item.
#
# Scores are collected for every non-blank header starting with "s-" or "t-";
# unanswered items (a score of 0 after to_i) are ignored. The row passes when
# at least two distinct scores remain. For integer scores this is exactly
# "two or more scores with a non-zero standard deviation", but it needs no
# floating-point comparison and no Array#stdev extension.
#
# @return [Boolean]
def valid_sd?
  scores = headers.filter_map do |header|
    next unless header.present? && header.start_with?('s-', 't-')

    score = likert_score(survey_item_id: header).to_i
    score unless score.zero?
  end

  scores.uniq.size > 1
end
|
||||
|
||||
# Whether a school was found for this row.
#
# @return [Boolean]
def valid_school?
  !school.blank?
end
|
||||
|
||||
private
|
||||
|
||||
# For every header ending in "-1" (a variant of a survey item), copies the
# variant's value onto the main item's column when the variant was answered.
#
# Only the trailing "-1" is removed to find the main item's id; a "-1"
# elsewhere in the id is left alone.
def copy_likert_scores_from_variant_survey_items
  headers.each do |header|
    next unless header.present? && header.end_with?('-1')

    variant_score = row[header]
    next unless variant_score.present?

    row[header.delete_suffix('-1')] = variant_score
  end
end
|
||||
end
|
||||
|
||||
# Extension mixed into CSV::Row (see the `include` below this module).
module RowMonkeyPatches
  # Returns the row's values, in column order, leaving out every column whose
  # header is blank, starts with "Q", or ends with "-1".
  #
  # @return [Array]
  def remove_unwanted_columns
    wanted = to_h.reject do |key, _value|
      key.blank? || key.start_with?('Q') || key.end_with?('-1')
    end
    wanted.values
  end
end
|
||||
|
||||
CSV::Row.include RowMonkeyPatches
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ class SurveyResponsesDataLoader
|
|||
|
||||
file.lazy.each_slice(500) do |lines|
|
||||
survey_item_responses = CSV.parse(lines.join, headers:).map do |row|
|
||||
process_row(row: SurveyItemValues.new(row:, headers:, genders: genders_hash, survey_items: all_survey_items),
|
||||
process_row(row: SurveyItemValues.new(row:, headers: headers.split(','), genders: genders_hash, survey_items: all_survey_items, schools:),
|
||||
rules:)
|
||||
end
|
||||
SurveyItemResponse.import survey_item_responses.compact.flatten, batch_size: 500
|
||||
|
|
@ -29,7 +29,7 @@ class SurveyResponsesDataLoader
|
|||
next unless line.present?
|
||||
|
||||
CSV.parse(line, headers:).map do |row|
|
||||
survey_item_responses << process_row(row: SurveyItemValues.new(row:, headers:, genders: genders_hash, survey_items: all_survey_items),
|
||||
survey_item_responses << process_row(row: SurveyItemValues.new(row:, headers: headers.split(','), genders: genders_hash, survey_items: all_survey_items, schools:),
|
||||
rules:)
|
||||
end
|
||||
|
||||
|
|
@ -54,6 +54,7 @@ class SurveyResponsesDataLoader
|
|||
return if rule.new(row:).skip_row?
|
||||
end
|
||||
|
||||
# byebug if row.response_id == 'butler_student_survey_response_1'
|
||||
process_survey_items(row:)
|
||||
end
|
||||
|
||||
|
|
@ -82,13 +83,12 @@ class SurveyResponsesDataLoader
|
|||
end
|
||||
end
|
||||
|
||||
def self.genders
|
||||
gender_hash = {}
|
||||
def self.schools
|
||||
School.school_hash
|
||||
end
|
||||
|
||||
Gender.all.each do |gender|
|
||||
gender_hash[gender.qualtrics_code] = gender
|
||||
end
|
||||
gender_hash
|
||||
def self.genders
|
||||
Gender.gender_hash
|
||||
end
|
||||
|
||||
def self.survey_items(headers:)
|
||||
|
|
@ -96,9 +96,9 @@ class SurveyResponsesDataLoader
|
|||
end
|
||||
|
||||
def self.get_survey_item_ids_from_headers(headers:)
|
||||
CSV.parse(headers, headers: true).headers
|
||||
.filter(&:present?)
|
||||
.filter { |header| header.start_with? 't-' or header.start_with? 's-' }
|
||||
headers.split(',')
|
||||
.filter(&:present?)
|
||||
.filter { |header| header.start_with? 't-', 's-' }
|
||||
end
|
||||
|
||||
private_class_method :process_row
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue