From 2ea21c767ff37e982cb7a84723d6baf4da19d509 Mon Sep 17 00:00:00 2001 From: Gabe Farrell Date: Thu, 20 Apr 2023 22:38:39 -0400 Subject: [PATCH] Merge Student Data --- merge-csv.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/merge-csv.py b/merge-csv.py index f29cad0..2859454 100644 --- a/merge-csv.py +++ b/merge-csv.py @@ -6,6 +6,7 @@ import numpy as np import datetime import itertools as it import argparse +import re import pprint @@ -23,6 +24,8 @@ def get_date(): def cap_permutations(s): + if len(s) > 15: + return [s] lu_sequence = ((c.lower(), c.upper()) for c in s) return [''.join(x) for x in it.product(*lu_sequence)] @@ -36,8 +39,13 @@ def combine_rows(df, col, possibilities): for p in possibilities: allp += cap_permutations(p) # also have to remove the final column from the possibilities + safety = 0 while col in allp: allp.remove(col) + safety += 1 + if safety > 100: + print(f'Infinite loop detected, shutting down.') + exit(1) # list to store replaced columns drops = [] # for every column possibility that does exist... @@ -56,11 +64,22 @@ def combine_rows(df, col, possibilities): def do_merge_student(cwd, mwd): # identify and merge student files + print('---Merging Student Data---') all_files = glob.glob(os.path.join(cwd, "*student*.csv")) - print(all_files) - df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True) + print(f'Found {len(all_files)} CSV files') + print('Merging...') + files = [pd.read_csv(f) for f in all_files] + lines = 0 + for f in files: + lines += f.shape[0] + df = pd.concat(files, ignore_index=True) + print('Repairing rows...') + df = repair_student_rows(df) + if df.shape[0] != lines: + print(f'Warning! Line count mismatch: {lines} expected, but got {df.shape[0]}') date = get_date() df.to_csv(os.path.join(mwd, f'{date}-student-data-merged.csv')) + print('Student data merged successfully!') def do_merge_teacher(cwd, mwd): @@ -90,6 +109,33 @@ def repair_teacher_rows(df): return df +def repair_student_rows(df): + df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate']) + df = combine_rows(df, 'Response ID', ['Responseid', 'Response id']) + df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school']) + df = combine_rows(df, 'Grade', ['grade', 'What grade are you in?']) + df = combine_rows(df, 'Gender', ['gender', 'Gender - self report', 'What is your gender?', 'What is your gender? - Selected Choice']) + df = combine_rows(df, 'Race', ['Race- self report', 'race', 'Race - self report']) + print('Combining Question Variants...') + df = combine_variants(df) + return df + + +def combine_variants(df): + drops = [] + for col in df: + x = re.search(r's-[a-z]{4}-q[0-9]-1', col) + if x is not None: + # get non variant version + nonvar = col[:-2] + # combine into non variant + df[nonvar] = df[nonvar].replace(r'^\s*$', np.nan, regex=True).fillna(df[col]) + # and add it to the drop list + drops.append(col) + df = df.drop(columns=drops) + return df + + if __name__ == '__main__': # load environment vars load_dotenv() @@ -117,5 +163,7 @@ if __name__ == '__main__': c, m = prep_dir(args.folder) if args.teacher: do_merge_teacher(c, m) - # if args.student: - # do_merge_student(c, m) + if args.student: + do_merge_student(c, m) + +# TODO: Regex match cols with title s-****-q#-1 and merge with col s-****-q# \ No newline at end of file