diff --git a/merge.py b/merge.py
index 7f70390..34e5bb0 100644
--- a/merge.py
+++ b/merge.py
@@ -7,6 +7,8 @@ import itertools as it
 import argparse
 import re
 from urllib.parse import urlparse
+from dateutil.parser import parse
+from dateutil.relativedelta import relativedelta
 import pysftp
 
 #NOTE for now, each of the arrays should be all lowercase
@@ -228,6 +230,9 @@ def do_merge_student(cwd, mwd):
     # clean out unnecessary columns
     if not argQuiet: print('Cleaning out columns...')
     df = clean_cols_student(df)
+    # add academic year column
+    if not argQuiet: print('Adding \'Academic Year\' column...')
+    df = add_academic_year(df)
     # ensure line count matches what is expected
     if df.shape[0] != lines:
         print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
@@ -266,6 +271,9 @@ def do_merge_teacher(cwd, mwd):
     # clean out unnecessary columns
     if not argQuiet: print('Cleaning out columns...')
     df = clean_cols_teacher(df)
+    # add academic year column
+    if not argQuiet: print('Adding \'Academic Year\' column...')
+    df = add_academic_year(df)
     # ensure line count matches what is expected
     if df.shape[0] != lines:
         print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
@@ -313,6 +321,49 @@ def combine_variants(df):
     df = df.drop(columns=drops)
     return df
 
+# take the dates in 'Recorded Date' and use them to add
+# a column for academic year
+# note: must be used after the columns are merged because this only
+# looks for the column 'Recorded Date'
+def add_academic_year(df):
+    """Add an 'Academic Year' column derived from 'Recorded Date'.
+
+    Every value in df['Recorded Date'] is passed through
+    date_str_to_academic_year. The frame is modified in place and also
+    returned, which is the convention the other helpers here follow.
+    """
+    recorded_date = df['Recorded Date'].tolist()
+    df['Academic Year'] = [date_str_to_academic_year(datestr) for datestr in recorded_date]
+    return df
+
+
+def date_str_to_academic_year(str):
+    """Convert a date string such as '3/16/2023 13:23' to an academic year.
+
+    January-June dates belong to the academic year that started in the
+    previous calendar year ('2022-23'); July-December dates belong to the
+    one starting that year ('2021-22'). Values that cannot be parsed as a
+    date print a warning and yield the placeholder 'Undefined'.
+    """
+    # note: the parameter name shadows the builtin str; it is kept so that
+    # existing callers are unaffected
+    try:
+        date = parse(str).date()
+    except (TypeError, ValueError, OverflowError):
+        # TypeError: non-string cells such as NaN
+        # ValueError: text dateutil cannot read (its ParserError subclasses it)
+        # OverflowError: numeric fields too large for a date
+        # TODO: ideally warn only once per merged csv rather than once per row
+        print('WARN: Found non-date value in \'Recorded Date\' column, \'Academic Year\' will contain \'Undefined\' for some rows')
+        return 'Undefined'
+    # use plain year numbers instead of date arithmetic so Feb 29 never
+    # has to be shifted into a non-leap year
+    if date.month < 7: # spring semester
+        start_year = date.year - 1
+    else: # fall semester
+        start_year = date.year
+    # modulo keeps the end year at two digits for any century
+    return f'{start_year}-{(start_year + 1) % 100:02d}'
 
 
 if __name__ == '__main__':
diff --git a/test.py b/test.py
index 5feefcc..1dabd83 100644
--- a/test.py
+++ b/test.py
@@ -218,6 +218,42 @@ class TestMergeCSV(unittest.TestCase):
         })
         td = merge.repair_teacher_columns(td)
         self.assertTrue(td.equals(expected), td)
+
+    def test_date_str_to_academic_year(self):
+        # test for spring semester
+        datestr = "3/16/2023 13:23"
+        academic_year = merge.date_str_to_academic_year(datestr)
+        self.assertEqual(academic_year, '2022-23')
+        # test for fall semester
+        datestr = "9/16/2021 13:23"
+        academic_year = merge.date_str_to_academic_year(datestr)
+        self.assertEqual(academic_year, '2021-22')
+        # test for the century rollover (end year must stay two digits)
+        self.assertEqual(merge.date_str_to_academic_year('9/16/1999 13:23'), '1999-00')
+        self.assertEqual(merge.date_str_to_academic_year('3/16/2000 13:23'), '1999-00')
+        # test for values that are not dates
+        self.assertEqual(merge.date_str_to_academic_year('not a date'), 'Undefined')
+        self.assertEqual(merge.date_str_to_academic_year(float('nan')), 'Undefined')
+
+    def test_add_academic_year(self):
+        td = pd.DataFrame({
+            'Recorded Date': [
+                '9/16/2021 13:23',
+                '3/16/2023 13:23'
+            ]
+        })
+        expected = pd.DataFrame({
+            'Recorded Date': [
+                '9/16/2021 13:23',
+                '3/16/2023 13:23'
+            ],
+            'Academic Year': [
+                '2021-22',
+                '2022-23'
+            ]
+        })
+        td = merge.add_academic_year(td)
+        self.assertTrue(td.equals(expected), td)
 
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file