From 2ea21c767ff37e982cb7a84723d6baf4da19d509 Mon Sep 17 00:00:00 2001
From: Gabe Farrell <gabe@mnrva.dev>
Date: Thu, 20 Apr 2023 22:38:39 -0400
Subject: [PATCH] Merge Student Data

---
 merge-csv.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/merge-csv.py b/merge-csv.py
index f29cad0..2859454 100644
--- a/merge-csv.py
+++ b/merge-csv.py
@@ -6,6 +6,7 @@ import numpy as np
 import datetime
 import itertools as it
 import argparse
+import re
 import pprint
 
 
@@ -23,6 +24,8 @@ def get_date():
 
 
 def cap_permutations(s):
+    if len(s) > 15:  # the product below yields 2**len(s) strings, so longer names are not expanded
+        return [s]  # the name is returned as its only candidate, in its given casing
     lu_sequence = ((c.lower(), c.upper()) for c in s)
     return [''.join(x) for x in it.product(*lu_sequence)]
 
@@ -36,8 +39,13 @@ def combine_rows(df, col, possibilities):
     for p in possibilities:
         allp += cap_permutations(p)
     # also have to remove the final column from the possibilities
+    safety = 0
     while col in allp:
         allp.remove(col)
+        safety += 1
+        if safety > 100:
+            print(f'Infinite loop detected, shutting down.')
+            exit(1)
     # list to store replaced columns
     drops = []
     # for every column possibility that does exist...
@@ -56,11 +64,22 @@ def combine_rows(df, col, possibilities):
 
 def do_merge_student(cwd, mwd):
     # identify and merge student files
+    print('---Merging Student Data---')
     all_files = glob.glob(os.path.join(cwd, "*student*.csv"))
-    print(all_files)
-    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
+    print(f'Found {len(all_files)} CSV files')
+    print('Merging...')
+    files = [pd.read_csv(f) for f in all_files]  # one DataFrame per matched CSV, all read before merging
+    lines = 0  # total number of rows across the input frames
+    for f in files:
+        lines += f.shape[0]
+    df = pd.concat(files, ignore_index=True)  # NOTE(review): presumably raises when no file matched the glob - confirm and guard
+    print('Repairing rows...')
+    df = repair_student_rows(df)
+    if df.shape[0] != lines:  # sanity check: the repaired frame should still have one row per input row
+        print(f'Warning! Line count mismatch: {lines} expected, but got {df.shape[0]}')  # reported only; the merged file is still written
     date = get_date()
     df.to_csv(os.path.join(mwd, f'{date}-student-data-merged.csv'))
+    print('Student data merged successfully!')  # printed unconditionally, even after the mismatch warning above
 
 
 def do_merge_teacher(cwd, mwd):
@@ -90,6 +109,33 @@ def repair_teacher_rows(df):
     return df
 
 
+def repair_student_rows(df):  # pass the merged student frame through combine_rows per known column, then combine_variants
+    df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate'])  # args: frame, target column, alternate header spellings
+    df = combine_rows(df, 'Response ID', ['Responseid', 'Response id'])
+    df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school'])  # NOTE(review): 'school' is listed as a DeseId header - confirm
+    df = combine_rows(df, 'Grade', ['grade', 'What grade are you in?'])  # spellings over 15 chars skip case expansion (see cap_permutations)
+    df = combine_rows(df, 'Gender', ['gender', 'Gender - self report', 'What is your gender?', 'What is your gender? - Selected Choice'])
+    df = combine_rows(df, 'Race', ['Race- self report', 'race', 'Race - self report'])
+    print('Combining Question Variants...')
+    df = combine_variants(df)  # folds 's-xxxx-q#-1' question columns into their 's-xxxx-q#' counterparts
+    return df
+
+
+def combine_variants(df):  # merge each 's-xxxx-q#-1' variant column into its 's-xxxx-q#' base column, then drop the variant
+    drops = []  # variant columns to remove after their values have been merged
+    for col in df:  # presumably yields the column labels (each is used as df[col] below)
+        x = re.search(r's-[a-z]{4}-q[0-9]-1', col)  # NOTE(review): unanchored and single-digit; the slice below assumes '-1' ends the name
+        if x is not None:
+            # base column name: the variant name without its last two characters
+            nonvar = col[:-2]
+            # treat empty or whitespace-only base cells as missing, then fill the gaps from the variant
+            df[nonvar] = df[nonvar].replace(r'^\s*$', np.nan, regex=True).fillna(df[col])  # NOTE(review): assumes the base column exists - confirm
+            # and add it to the drop list
+            drops.append(col)
+    df = df.drop(columns=drops)
+    return df
+
+
 if __name__ == '__main__':
     # load environment vars
     load_dotenv()
@@ -117,5 +163,7 @@ if __name__ == '__main__':
     c, m = prep_dir(args.folder)
     if args.teacher:
         do_merge_teacher(c, m)
-    # if args.student:
-        # do_merge_student(c, m)
+    if args.student:
+        do_merge_student(c, m)
+
+# TODO(review): combine_variants() regex-matches cols titled s-****-q#-1 and merges them into col s-****-q#; drop this note once verified
\ No newline at end of file