From ab5d2cbbca31973cad13040439b03afc8828fb7b Mon Sep 17 00:00:00 2001 From: Gabe Farrell Date: Wed, 26 Apr 2023 14:25:02 -0400 Subject: [PATCH] More efficient merging, Verbose, Fix columns --- README.md | Bin 1302 -> 1410 bytes merge-csv.py | 69 ++++++++++++++++++++++++++------------------------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index d3f39f9937e820c95983e3f84128e7c58169bbba..33771924fb8a1208757a85352a52b8a1db481f29 100644 GIT binary patch delta 117 zcmbQn)x^C)i;=NxvnHb~Q(_r|4ub-NE)bS6q%ssSBr)VO6a!fbP*}uJ%8&0Fw$p6?qJ4K(Qi*T%c?Tke| 15: +# return [s] +# lu_sequence = ((c.lower(), c.upper()) for c in s) +# return [''.join(x) for x in it.product(*lu_sequence)] -def cap_permutations(s): - if len(s) > 15: - return [s] - lu_sequence = ((c.lower(), c.upper()) for c in s) - return [''.join(x) for x in it.product(*lu_sequence)] - -def combine_rows(df, col, possibilities): +def combine_cols(df, col, possibilities): # if final column doesn't exist, create it if col not in df.columns: tmpdf = pd.DataFrame([np.nan], columns=[col]) df = pd.concat((df, tmpdf), axis=1) - # generate all upper/lowercase possibilities for columns - allp = [] - for p in possibilities: - allp += cap_permutations(p) - # also have to remove the final column from the possibilities - safety = 0 - while col in allp: - allp.remove(col) - safety += 1 - if safety > 100: - print(f'Fatal: Infinite loop detected, shutting down.') - exit(1) # list to store replaced columns drops = [] # for every column possibility that does exist... - for c in allp: - if c in df.columns: + for cl in df.columns: + if cl.lower() in possibilities: + # we don't want to merge and drop our final column + if cl == col: + continue # replace the column... - # print(f'Replacing column {c}') - df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[c]) + if args.verbose: print(f'Replacing column {cl}') + df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[cl]) # and add it to the drop list - drops.append(c) + drops.append(cl) # drop spent columns df = df.drop(columns=drops) - # print(f'Dropped columns: {drops}') + if args.verbose: print(f'Dropped columns: {drops}') return df @@ -209,19 +202,19 @@ def do_merge_teacher(cwd, mwd): def repair_teacher_rows(df): - df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate']) - df = combine_rows(df, 'Response ID', ['Responseid', 'Response id']) - df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school']) + df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate']) + df = combine_cols(df, 'Response ID', ['responseid', 'response id']) + df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school']) return df def repair_student_rows(df): - df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate']) - df = combine_rows(df, 'Response ID', ['Responseid', 'Response id']) - df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school']) - df = combine_rows(df, 'Grade', ['grade', 'What grade are you in?']) - df = combine_rows(df, 'Gender', ['gender', 'Gender - self report', 'What is your gender?', 'What is your gender? - Selected Choice']) - df = combine_rows(df, 'Race', ['Race- self report', 'race', 'Race - self report']) + df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate']) + df = combine_cols(df, 'Response ID', ['responseid', 'response id']) + df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school']) + df = combine_cols(df, 'Grade', ['grade', 'what grade are you in?']) + df = combine_cols(df, 'Gender', ['gender', 'what is your gender?', 'what is your gender? - selected choice']) + df = combine_cols(df, 'Race', ['race']) if not args.quiet: print('Combining Question Variants...') df = combine_variants(df) return df @@ -264,6 +257,10 @@ if __name__ == '__main__': action='store_true', dest='quiet', help='run without output (besides errors and warnings)') + parser.add_argument('-v', '--verbose', + action='store_true', + dest='verbose', + help='run with extra output information') parser.add_argument('-p', '--project', action='store', help='add a project name to the merged csv file name') @@ -273,6 +270,10 @@ if __name__ == '__main__': help='sftp url for remote merging') args = parser.parse_args() + #quiet takes precedence over verbose + if args.quiet: + args.verbose = False + # make sure -s or -t is set if not (args.student or args.teacher): if not args.quiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')