"""Merge CSV files containing student and teacher data.

Reconstructed from the whitespace-collapsed git patch 7727b77 ("Merge Teacher
CSVs", Gabe Farrell, 2023-04-20) that created ``merge-csv.py``.  The same
patch also added a ``.gitignore`` (``.env``, ``.idea``, ``test-csv/merged``,
``ecp-csv``) and a binary ``README.md``; neither is reproduced here.

Usage: python merge-csv.py (-st) -d (directory)
"""
import argparse
import datetime
import glob
import itertools as it
import os
import pprint

import numpy as np
import pandas as pd


def prep_dir(folder):
    """Resolve the input directory and make sure its ``merged`` subfolder exists.

    Args:
        folder: Directory holding the CSV files.  A relative path is resolved
            against the current working directory; an absolute path is used
            as-is.  ``None`` or an empty string selects the current working
            directory (this is what ``argparse`` passes when ``-d`` is
            omitted).

    Returns:
        A ``(cwd, mwd)`` tuple: the resolved input directory and the
        ``merged`` directory inside it.

    Raises:
        FileNotFoundError: If the input directory does not exist.
    """
    cwd = os.path.join(os.getcwd(), folder) if folder else os.getcwd()
    if not os.path.isdir(cwd):
        raise FileNotFoundError(f'CSV directory not found: {cwd}')
    mwd = os.path.join(cwd, 'merged')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(mwd, exist_ok=True)
    return cwd, mwd


def get_date():
    """Return today's date formatted like ``April-20-2023`` for file names."""
    return datetime.date.today().strftime("%B-%d-%Y")


def cap_permutations(s):
    """Return every upper/lower-case spelling of *s*.

    The result has ``2 ** len(s)`` entries (characters without case, such as
    spaces, produce duplicates), so only call this with short strings.
    """
    lu_sequence = ((c.lower(), c.upper()) for c in s)
    return [''.join(x) for x in it.product(*lu_sequence)]


def combine_rows(df, col, possibilities):
    """Fold differently spelled variants of a column into the column *col*.

    Every column whose name equals one of *possibilities*, ignoring case, is
    used to fill the missing or blank cells of *col* and is then dropped.
    *col* is created (all NaN) when it does not exist yet.  Variants are
    applied in the order of *possibilities*, then in frame column order, so
    an earlier variant wins when two of them have a value in the same row.

    Args:
        df: Input frame.  It is not modified.
        col: Name of the canonical column to keep.
        possibilities: Alternative spellings of *col*; matched
            case-insensitively.  A column named exactly *col* is never
            treated as a variant.

    Returns:
        A new DataFrame with the variants merged into *col* and removed.
    """
    if col in df.columns:
        merged = df[col]
    else:
        merged = pd.Series(np.nan, index=df.index)

    drops = []
    for alias in possibilities:
        wanted = alias.lower()
        # Compare against the columns that actually exist instead of
        # enumerating all 2**len(alias) capitalisations of the alias.
        for c in df.columns:
            if not isinstance(c, str) or c == col or c in drops:
                continue
            if c.lower() != wanted:
                continue
            # Whitespace-only cells count as missing before filling.
            merged = merged.replace(r'^\s*$', np.nan, regex=True).fillna(df[c])
            drops.append(c)

    # drop() + assign() both return new frames, so the caller's df is left
    # untouched; assign() keeps an existing column in place and appends a
    # new one at the end.
    return df.drop(columns=drops).assign(**{col: merged})


def do_merge_student(cwd, mwd):
    """Concatenate every ``*student*.csv`` in *cwd* into one file in *mwd*.

    Does nothing (apart from reporting it) when no file matches.
    """
    # glob order is OS-dependent; sort so the merged row order is stable.
    all_files = sorted(glob.glob(os.path.join(cwd, "*student*.csv")))
    print(all_files)
    if not all_files:
        # pd.concat raises ValueError on an empty sequence.
        print('No student CSV files found; nothing to merge.')
        return
    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    date = get_date()
    # NOTE(review): the row index is written as an unnamed first column;
    # confirm consumers expect it before switching to index=False.
    df.to_csv(os.path.join(mwd, f'{date}-student-data-merged.csv'))


def do_merge_teacher(cwd, mwd):
    """Concatenate and repair every ``*teacher*.csv`` in *cwd*.

    The merged frame is passed through :func:`repair_teacher_rows` and
    written to ``<mwd>/<date>-teacher-data-merged.csv``.  A warning is
    printed if the row count after repair differs from the sum of the input
    row counts.  Does nothing (apart from reporting it) when no file matches.
    """
    print('---Merging Teacher Data---')
    # glob order is OS-dependent; sort so the merged row order is stable.
    all_files = sorted(glob.glob(os.path.join(cwd, "*teacher*.csv")))
    print(f'Found {len(all_files)} CSV files')
    if not all_files:
        # pd.concat raises ValueError on an empty sequence.
        print('No teacher CSV files found; nothing to merge.')
        return
    print('Merging...')
    frames = [pd.read_csv(f) for f in all_files]
    lines = sum(frame.shape[0] for frame in frames)
    df = pd.concat(frames, ignore_index=True)
    print('Repairing rows...')
    df = repair_teacher_rows(df)
    if df.shape[0] != lines:
        print(f'Warning! Line count mismatch: {lines} expected, but got {df.shape[0]}')
    date = get_date()
    # NOTE(review): the row index is written as an unnamed first column;
    # confirm consumers expect it before switching to index=False.
    df.to_csv(os.path.join(mwd, f'{date}-teacher-data-merged.csv'))
    print('Teacher data merged successfully!')


def repair_teacher_rows(df):
    """Normalise the teacher column names that differ between input files.

    Merges the known spellings into ``Recorded Date``, ``Response ID`` and
    ``DeseId`` (a ``school`` column is treated as a ``DeseId`` variant).
    """
    df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate'])
    df = combine_rows(df, 'Response ID', ['Responseid', 'Response id'])
    df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school'])
    return df


def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # Imported here so the helpers above stay importable without
    # python-dotenv installed; only the script entry point needs it.
    from dotenv import load_dotenv

    # load environment vars
    load_dotenv()
    # parse flags
    parser = argparse.ArgumentParser(
        prog='merge-csv',
        description='Merges CSV Files containing student and teacher data',
        epilog='Usage: python merge-csv.py (-st) -d (directory)')
    parser.add_argument('-d', '--folder',
                        action='store',
                        help='directory for local csv merging')
    parser.add_argument('-t', '--teacher',
                        action='store_true',
                        dest='teacher',
                        help='merge teacher data')
    parser.add_argument('-s', '--student',
                        action='store_true',
                        dest='student',
                        help='merge student data')
    args = parser.parse_args(argv)
    # make sure -s or -t is set
    if not (args.student or args.teacher):
        print('Warning: Neither -s nor -t are specified. No merge will be performed.')
    # do merge
    c, m = prep_dir(args.folder)
    if args.teacher:
        do_merge_teacher(c, m)
    if args.student:
        # The student merge was left switched off in the original script;
        # say so instead of silently ignoring the flag.
        print('Student merge is currently disabled; skipping.')


if __name__ == '__main__':
    main()