column merging fixed

3 years ago · 676cb9d931
parent 8f8c14446e
commit 676cb9d931
2 changed files with 89 additions and 60 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,7 @@
-.env
+.env
-.idea
+.idea
-test-csv
+test-*
-ecp-csv
+ecp-csv
-test-csv-large
+test-csv-large
-merged
+merged
 bfg.jar
--- a/merge-csv.py
+++ b/merge-csv.py
@ -9,6 +9,38 @@ import re
 from urllib.parse import urlparse
 import pysftp
 # TODO make sure these are not case sensitive
 # all of the columns we want to extract from the csv file
 # excluding the question ids (they are found using regex)
 final_columns_student = {
    'Start Date': ['startdate', 'start date'],
    'End Date': ['enddate', 'end date'],
    'Status': ['status'],
    'Ip Address': ['ip address', 'ipaddress'],
    'Progress': ['progress'],
    'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
    'District': ['district', 'please select your school district.'],
    'LASID': ['lasid', 'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).'],
    'Grade': ['grade', 'what grade are you in?'],
    'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
    'Race': ['race'],
    'Recorded Date': ['recorded date', 'recordeddate'],
    'Response Id': ['responseid', 'response id'],
    'Dese Id': ['deseid', 'dese id', 'school'],
 }
 final_columns_teacher = {
    'Start Date': ['startdate', 'start date'],
    'End Date': ['enddate', 'end date'],
    'Status': ['status'],
    'Ip Address': ['ip address', 'ipaddress'],
    'Progress': ['progress'],
    'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
    'District': ['district', 'please select your school district.'],
    'Recorded Date': ['recorded date', 'recordeddate'],
    'Response Id': ['responseid', 'response id'],
    'Dese Id': ['deseid', 'dese id', 'school'],
 }
 class Sftp:
    def __init__(self, hostname, username, password, cnopts, port=22):
@ -97,28 +129,25 @@ class Sftp:
            raise Exception(err)
 # prepare csv and merged csv directories
 def prep_dir(folder=''):
    # prepare directories
    cwd = os.path.join(os.getcwd(), folder)
    mwd = os.path.join(cwd, 'merged')
    if not os.path.exists(mwd):
        if args.verbose: print(f'Creating directory {mwd}')
        os.mkdir(mwd)
    if args.verbose: print('Source data directory: ' + cwd)
    if args.verbose: print('Merged data directory: ' + mwd)
    return cwd, mwd
 # get current date in Month-XX-YYYY format
 def get_date():
    return datetime.date.today().strftime("%B-%d-%Y")
 # UNUSED
 # def cap_permutations(s):
 #     if len(s) > 15:
 #         return [s]
 #     lu_sequence = ((c.lower(), c.upper()) for c in s)
 #     return [''.join(x) for x in it.product(*lu_sequence)]
 # in dataframe df, merges any column in possibilities into the final column col
 def combine_cols(df, col, possibilities):
    # if final column doesn't exist, create it
    if col not in df.columns:
@ -143,38 +172,12 @@ def combine_cols(df, col, possibilities):
    return df
-def clean_cols(df):
+# removes unused columns from student data
-    keep = [
+def clean_cols_student(df):
-        'StartDate',
+    keep = list(final_columns_student.keys())
        'EndDate',
        'Start Date',
        'End Date',
        'Status',
        'Response Type',
        'IpAddress',
        'Ip Address'
        'Progress',
        'Duration',
        'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).',
        'Finished',
        'District',
        'LASID',
        'Recorded Date',
        'RecordedDate',
        'Grade',
        'Gender',
        'Race',
        'Response Id',
        'ResponseId',
        'DeseId',
        'Dese Id',
        'School',
        'District',
        'Please select your school district.',
    ]
    keep = list(map(str.lower, keep))
    drops = []
-    question_pattern = re.compile("^[s,t]-[a-zA-Z]{4}-q[0-9][0-9]?$")
+    question_pattern = re.compile("^s-[a-zA-Z]{4}-q[0-9][0-9]?$")
    for col in df.columns:
        if col.lower() not in keep and not bool(question_pattern.match(col)):
            drops.append(col)
@ -183,7 +186,21 @@ def clean_cols(df):
    return df
 # removes unused columns from teacher data
 def clean_cols_teacher(df):
    keep = list(final_columns_teacher.keys())
    keep = list(map(str.lower, keep))
    drops = []
    question_pattern = re.compile("^t-[a-zA-Z]{4}-q[0-9][0-9]?$")
    for col in df.columns:
        if col.lower() not in keep and not bool(question_pattern.match(col)):
            drops.append(col)
    df = df.drop(columns=drops)
    if args.verbose: print(f'Dropped columns: {drops}')
    return df
 # performs all merging operations for student data
 def do_merge_student(cwd, mwd):
    # identify and merge student files
    if not args.quiet: print('---Merging Student Data---')
@ -194,16 +211,22 @@ def do_merge_student(cwd, mwd):
        return
    if not args.quiet: print('Merging...')
    files = [pd.read_csv(f, low_memory=False) for f in all_files]
    # count lines in read csv files
    lines = 0
    for fi in files:
        lines += fi.shape[0]
    # combine csv files
    df = pd.concat(files, axis=0)
    # combine related columns
    if not args.quiet: print('Repairing rows...')
-    df = repair_student_rows(df)
+    df = repair_student_columns(df)
    # clean out unnecessary columns
    if not args.quiet: print('Cleaning out columns...')
-    df = clean_cols(df)
+    df = clean_cols_student(df)
    # ensure line count matches what is expected
    if df.shape[0] != lines:
        print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
    # save merged file
    date = get_date()
    if args.project:
        proj = '-' + args.project
@ -215,6 +238,7 @@ def do_merge_student(cwd, mwd):
    return fn
 # performs all merging operations for teacher data
 def do_merge_teacher(cwd, mwd):
    # identify and merge teacher files
    if not args.quiet: print('---Merging Teacher Data---')
@ -225,16 +249,22 @@ def do_merge_teacher(cwd, mwd):
        return
    if not args.quiet: print('Merging...')
    files = [pd.read_csv(f, low_memory=False) for f in all_files]
    # count lines in read csv files
    lines = 0
    for f in files:
        lines += f.shape[0]
    # combine csv files
    df = pd.concat(files, axis=0)
-    if not args.quiet: print('Repairing rows...')
+    # combine related columns
-    df = repair_teacher_rows(df)
+    if not args.quiet: print('Repairing columns...')
    df = repair_teacher_columns(df)
    # clean out unnecessary columns
    if not args.quiet: print('Cleaning out columns...')
-    df = clean_cols(df)
+    df = clean_cols_teacher(df)
    # ensure line count matches what is expected
    if df.shape[0] != lines:
        print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
    # save merged file
    date = get_date()
    if args.project:
        proj = '-' + args.project
@ -246,29 +276,28 @@ def do_merge_teacher(cwd, mwd):
    return fn
-def repair_teacher_rows(df):
+# merges teacher columns that may have mismatched names
-    df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate'])
+def repair_teacher_columns(df):
-    df = combine_cols(df, 'Response ID', ['responseid', 'response id'])
+    for col in final_columns_teacher:
-    df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school'])
+        df = combine_cols(df, col, final_columns_teacher[col])
    return df
-def repair_student_rows(df):
+# merges student columns that may have mismatched names, 
-    df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate'])
+# and combines question variants
-    df = combine_cols(df, 'Response ID', ['responseid', 'response id'])
+def repair_student_columns(df):
-    df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school'])
+    for col in final_columns_student:
-    df = combine_cols(df, 'Grade', ['grade', 'what grade are you in?'])
+        df = combine_cols(df, col, final_columns_student[col])
    df = combine_cols(df, 'Gender', ['gender', 'what is your gender?', 'what is your gender? - selected choice'])
    df = combine_cols(df, 'Race', ['race'])
    if not args.quiet: print('Combining Question Variants...')
    df = combine_variants(df)
    return df
 # combines question variants into non-variant columns
 def combine_variants(df):
    drops = []
    for col in df:
-        x = re.search(r's-[a-z]{4}-q[0-9][0-9]?-1', col)
+        x = re.search(r'^s-[a-z]{4}-q[0-9][0-9]?-1$', col)
        if x is not None:
            # get non variant version
            nonvar = col[:-2]