Add automated data cleaning. Modify SurveyItemValues class to use regex

instead of hard-coded values. Produce a clean CSV and a CSV with all
the removed values and columns, along with the reason for removal. Add a
script for running the cleaning for each project.
This commit is contained in:
rebuilt 2023-05-02 20:08:34 -07:00
parent 5cf5a5f383
commit 4509c157fa
20 changed files with 1148 additions and 154 deletions

33
lib/tasks/clean.rake Normal file
View file

@ -0,0 +1,33 @@
namespace :clean do
  # Run each task from inside the project it belongs to; the cleaner relies on
  # that project's schools already being present in the database.

  # Builds the three directories for one dataset and hands them to Cleaner.
  #
  # source_segments - path segments (below tmp/data) of the directory to read.
  # target_segments - path segments (below tmp/data) of the directory that
  #                   receives the `clean` and `removed` subdirectories.
  run_cleaner = lambda do |source_segments, target_segments|
    data_root = Rails.root.join('tmp', 'data')
    target_root = data_root.join(*target_segments)

    Cleaner.new(
      input_filepath: data_root.join(*source_segments),
      output_filepath: target_root.join('clean'),
      log_filepath: target_root.join('removed')
    ).clean
  end

  desc 'clean ecp data'
  task(ecp: :environment) { run_cleaner.call(%w[ecp_data raw], %w[ecp_data]) }

  # The prepped data lives under ecp_data/prepped and its output is written
  # beneath that same directory rather than next to it.
  desc 'clean prepped data'
  task(prepped: :environment) { run_cleaner.call(%w[ecp_data prepped], %w[ecp_data prepped]) }

  desc 'clean mciea data'
  task(mciea: :environment) { run_cleaner.call(%w[mciea_data raw], %w[mciea_data]) }

  desc 'clean rpp data'
  task(rpp: :environment) { run_cleaner.call(%w[rpp_data raw], %w[rpp_data]) }
end