Add automated data cleaning. Modify SurveyItemValues class to use regex

instead of hard-coded values. Produce a clean CSV and a CSV with all
the removed values and columns with the reason for removal. Add a script for
running the cleaning for each project.
This commit is contained in:
rebuilt 2023-05-02 20:08:34 -07:00
parent b5dc933187
commit 0dfc9726d0
21 changed files with 1214 additions and 149 deletions

12
scripts/prep_for_cleaning.sh Executable file
View file

@ -0,0 +1,12 @@
#!/usr/bin/env sh
# Strict mode: stop at the first failing command (errexit), treat expansion of
# an unset variable as an error (nounset), and echo every command to stderr
# before it runs (xtrace).
set -o errexit -o nounset -o xtrace
# Within lines 2 through 70 of the input, delete lines that start with 'Start Date', lines that start with '"{""ImportId""', and lines that start with whitespace. Line 1 and every line after line 70 are passed through unchanged.
#
# For example:
# Start Date,End Date,Response Type,IP Address,Progress,Duration (in seconds),Finished,Recorded Date,Response ID,Recipient Last Name,Recipient First Name,Recipient Email,External Data Reference,Location Latitude,Location Longitude,Distribution Channel,User Language,Please select your school district.,Please select your school in Lee.,Please select your school in Maynard,"Which of the following best describes the role you play at your school? Please select only 1 choice. (Please note: this question is used only to determine survey flow - i.e., some questions will be omitted based on the role you select. ECP will never report survey results according to teacher role.)",Given your preparation for teaching how comfortable are you teaching at the grade-level you have been assigned?,"How prepared are you for teaching the
# topics that you are expected to teach in your assignment?","How confident are you in working with the
# "{""ImportId"":""startDate"",""timeZone"":""America/New_York""}","{""ImportId"":""endDate"",""timeZone"":""America/New_York""}","{""ImportId"":""status""}","{""ImportId"":""ipAddress""}","{""ImportId"":""progress""}","{""ImportId"":""duration""}","{""ImportId"":""finished""}","{""ImportId"":""recordedDate"",""timeZone"":""America/New_York""}","{""ImportId"":""_recordId""}","{""ImportId"":""recipientLastName""}","{""ImportId"":""recipientFirstName""}","{""ImportId"":""recipientEmail""}","{""ImportId"":""externalDataReference""}","{""ImportId"":""locationLatitude""}","{""ImportId"":""locationLongitude""}","{""ImportId"":""distributionChannel""}","{""ImportId"":""userLanguage""}","{""ImportId"":""QID142""}","{""ImportId"":""QID140""}","{""ImportId"":""QID141""}","{""ImportId"":""QID139""}","{""ImportId"":""QID9""}","{""ImportId"":""QID10""}","{""ImportId"":""QID11""}","{""ImportId"":""QID12""}","{""ImportId"":""QID13""}","{""ImportId"":""QID18""}","{""ImportId"":""QID19""}","{""ImportId"":""QID20""}","{""ImportId"":""QID131""}","{""ImportId"":""QID132""}","{""ImportId"":""QID133""}","{""ImportId"":""QID135""}","{""ImportId"":""QID22""}","{""ImportId"":""QID23""}","{""ImportId"":""QID24""}","{""ImportId"":""QID25""}","{""ImportId"":""QID26""}","{""ImportId"":""QID33""}","{""ImportId"":""QID34""}","{""ImportId"":""QID35""}","{""ImportId"":""QID36""}","{""ImportId"":""QID37""}","{""ImportId"":""QID39""}","{""ImportId"":""QID40""}","{""ImportId"":""QID42""}","{""ImportId"":""QID43""}","{""ImportId"":""QID44""}","{""ImportId"":""QID45""}","{""ImportId"":""QID46""}","{""ImportId"":""QID47""}","{""ImportId"":""QID48""}","{""ImportId"":""QID49""}","{""ImportId"":""QID108""}","{""ImportId"":""QID109""}","{""ImportId"":""QID110""}","{""ImportId"":""QID50""}","{""ImportId"":""QID51""}","{""ImportId"":""QID52""}","{""ImportId"":""QID53""}","{""ImportId"":""QID54""}","{""ImportId"":""QID55""}","{""ImportId"":""QID56""}","{""ImportId"":""QID57""}","{""ImportId"":""QID58""}","{""Im
portId"":""QID63""}","{""ImportId"":""QID64""}","{""ImportId"":""QID65""}","{""ImportId"":""QID66""}","{""ImportId"":""QID67""}","{""ImportId"":""QID69""}","{""ImportId"":""QID70""}","{""ImportId"":""QID71""}","{""ImportId"":""QID72""}","{""ImportId"":""QID73""}","{""ImportId"":""QID74""}","{""ImportId"":""QID75""}","{""ImportId"":""QID79""}","{""ImportId"":""QID80""}","{""ImportId"":""QID81""}","{""ImportId"":""QID83""}","{""ImportId"":""QID84""}","{""ImportId"":""QID86""}","{""ImportId"":""QID87""}","{""ImportId"":""QID59""}","{""ImportId"":""QID60""}","{""ImportId"":""QID61""}","{""ImportId"":""QID62""}","{""ImportId"":""QID76""}","{""ImportId"":""QID77""}","{""ImportId"":""QID78""}","{""ImportId"":""QID91_TEXT""}","{""ImportId"":""QID92_TEXT""}","{""ImportId"":""QID89_1""}"
# Usage: prep_for_cleaning.sh FILE
#
# Writes a filtered copy of FILE to prepped_<basename of FILE>, placed in the
# same directory as FILE so that path arguments (e.g. data/x.csv) work; for a
# bare file name this is prepped_FILE in the current directory, as before.
input=${1:?"usage: $0 FILE"}
case $input in
  */*) output="${input%/*}/prepped_${input##*/}" ;;
  *)   output="prepped_${input}" ;;
esac

# Only lines 2-70 are filtered; within that range drop lines that start with
# 'Start Date', with '"{""ImportId""', or with whitespace. [[:space:]] is the
# POSIX spelling of the GNU-only \s, so the expression is portable across sed
# implementations.
# The input is opened via stdin redirection (listed first) so that unusual
# file names are never parsed as sed options and the output file is not
# created or truncated when the input cannot be opened.
sed '2,70{
/^Start Date/d
/^"{""ImportId""/d
/^[[:space:]]/d
}' < "$input" > "$output"