|
|
|
@ -103,50 +103,43 @@ def prep_dir(folder=''):
|
|
|
|
mwd = os.path.join(cwd, 'merged')
|
|
|
|
mwd = os.path.join(cwd, 'merged')
|
|
|
|
if not os.path.exists(mwd):
|
|
|
|
if not os.path.exists(mwd):
|
|
|
|
os.mkdir(mwd)
|
|
|
|
os.mkdir(mwd)
|
|
|
|
|
|
|
|
if args.verbose: print('Source data directory: ' + cwd)
|
|
|
|
|
|
|
|
if args.verbose: print('Merged data directory: ' + mwd)
|
|
|
|
return cwd, mwd
|
|
|
|
return cwd, mwd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_date():
    """Return today's date as a ``Month-DD-YYYY`` string.

    The format is ``%B-%d-%Y``: full month name (as spelled by the current
    locale), zero-padded day of month and four-digit year, joined by
    hyphens -- for example ``March-07-2024``.
    """
    return datetime.date.today().strftime("%B-%d-%Y")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cap_permutations(s):
    """Return every upper/lower-case spelling of ``s``.

    Each character contributes its lowercase and uppercase form, so a string
    with ``n`` cased characters yields ``2 ** n`` results. Characters without
    distinct cases (spaces, digits, punctuation) contribute a single choice,
    which keeps the result free of duplicate strings.

    Strings longer than 15 characters are returned unchanged as a
    one-element list (presumably to bound the exponential growth).

    Args:
        s: The string to expand.

    Returns:
        A list of strings, in ``itertools.product`` order (lowercase choice
        first for every position).
    """
    if len(s) > 15:
        return [s]

    choices = []
    for c in s:
        lower, upper = c.lower(), c.upper()
        # a caseless character would otherwise double every result
        choices.append((lower,) if lower == upper else (lower, upper))
    return [''.join(x) for x in it.product(*choices)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def combine_cols(df, col, possibilities, verbose=None):
    """Fold differently-named variants of one column into ``col``.

    Every column whose lowercased name matches one of ``possibilities``
    (compared case-insensitively) is used to fill the blank cells of ``col``
    and is then dropped. ``col`` itself is never merged or dropped, even if
    its lowercased name is listed.

    Args:
        df: The DataFrame to repair. When ``col`` already exists the merged
            values are assigned into this frame directly.
        col: Name of the final column; created (all NaN) if missing.
        possibilities: Alternative column names to merge into ``col``.
        verbose: Print each replaced column and the drop list. ``None``
            (the default) falls back to the module-level ``args.verbose``
            when ``args`` exists, otherwise ``False``.

    Returns:
        The DataFrame with the variants merged into ``col`` and removed.
    """
    if verbose is None:
        # ``args`` is only bound when the file runs as a script
        verbose = getattr(globals().get('args'), 'verbose', False)

    # if final column doesn't exist, create it
    if col not in df.columns:
        # assign() keeps the existing index, so no stray row is introduced
        # for empty or non-default-indexed frames
        df = df.assign(**{col: np.nan})

    # compare in lowercase so callers may pass names in any case
    wanted = {p.lower() for p in possibilities}

    # list to store replaced columns
    drops = []
    # for every column possibility that does exist...
    for cl in list(df.columns):
        if not isinstance(cl, str) or cl.lower() not in wanted:
            continue
        # we don't want to merge and drop our final column
        if cl == col:
            continue
        # replace the column...
        if verbose:
            print(f'Replacing column {cl}')
        # whitespace-only cells count as missing before filling
        df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[cl])
        # and add it to the drop list
        drops.append(cl)

    # drop spent columns
    df = df.drop(columns=drops)
    if verbose:
        print(f'Dropped columns: {drops}')
    return df


# Earlier name of this helper; kept so call sites that still use it work.
combine_rows = combine_cols
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@ -209,19 +202,19 @@ def do_merge_teacher(cwd, mwd):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def repair_teacher_rows(df):
    """Merge the known column-name variants of a teacher survey frame.

    Runs ``combine_cols`` once per canonical column ('Recorded Date',
    'Response ID', 'DeseId'), passing the lowercase alternative names that
    should be folded into it.

    Args:
        df: The teacher DataFrame to repair.

    Returns:
        The DataFrame returned by the last ``combine_cols`` call.
    """
    # (canonical column, lowercase alternative names)
    merges = (
        ('Recorded Date', ['recorded date', 'recordeddate']),
        ('Response ID', ['responseid', 'response id']),
        ('DeseId', ['deseid', 'dese id', 'school']),
    )
    for col, possibilities in merges:
        df = combine_cols(df, col, possibilities)
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def repair_student_rows(df):
    """Merge the known column-name variants of a student survey frame.

    Runs ``combine_cols`` once per canonical column, passing the lowercase
    alternative names that should be folded into it, then hands the frame
    to ``combine_variants``.

    Args:
        df: The student DataFrame to repair.

    Returns:
        The DataFrame returned by ``combine_variants``.
    """
    # (canonical column, lowercase alternative names)
    merges = (
        ('Recorded Date', ['recorded date', 'recordeddate']),
        ('Response ID', ['responseid', 'response id']),
        ('DeseId', ['deseid', 'dese id', 'school']),
        ('Grade', ['grade', 'what grade are you in?']),
        ('Gender', ['gender', 'what is your gender?', 'what is your gender? - selected choice']),
        ('Race', ['race']),
    )
    for col, possibilities in merges:
        df = combine_cols(df, col, possibilities)

    if not args.quiet:
        print('Combining Question Variants...')
    df = combine_variants(df)
    return df
|
|
|
|
@ -264,6 +257,10 @@ if __name__ == '__main__':
|
|
|
|
action='store_true',
|
|
|
|
action='store_true',
|
|
|
|
dest='quiet',
|
|
|
|
dest='quiet',
|
|
|
|
help='run without output (besides errors and warnings)')
|
|
|
|
help='run without output (besides errors and warnings)')
|
|
|
|
|
|
|
|
parser.add_argument('-v', '--verbose',
|
|
|
|
|
|
|
|
action='store_true',
|
|
|
|
|
|
|
|
dest='verbose',
|
|
|
|
|
|
|
|
help='run with extra output information')
|
|
|
|
parser.add_argument('-p', '--project',
|
|
|
|
parser.add_argument('-p', '--project',
|
|
|
|
action='store',
|
|
|
|
action='store',
|
|
|
|
help='add a project name to the merged csv file name')
|
|
|
|
help='add a project name to the merged csv file name')
|
|
|
|
@ -273,6 +270,10 @@ if __name__ == '__main__':
|
|
|
|
help='sftp url for remote merging')
|
|
|
|
help='sftp url for remote merging')
|
|
|
|
args = parser.parse_args()
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#quiet takes precedence over verbose
|
|
|
|
|
|
|
|
if args.quiet:
|
|
|
|
|
|
|
|
args.verbose = False
|
|
|
|
|
|
|
|
|
|
|
|
# make sure -s or -t is set
|
|
|
|
# make sure -s or -t is set
|
|
|
|
if not (args.student or args.teacher):
|
|
|
|
if not (args.student or args.teacher):
|
|
|
|
if not args.quiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
|
|
|
|
if not args.quiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
|
|
|
|
|