mirror of
https://github.com/edcommonwealth/merge-csv.git
synced 2026-03-07 21:48:13 -08:00
Merge Student Data
This commit is contained in:
parent
7727b77cda
commit
2ea21c767f
1 changed files with 52 additions and 4 deletions
56
merge-csv.py
56
merge-csv.py
|
|
@ -6,6 +6,7 @@ import numpy as np
|
|||
import datetime
|
||||
import itertools as it
|
||||
import argparse
|
||||
import re
|
||||
import pprint
|
||||
|
||||
|
||||
|
|
@ -23,6 +24,8 @@ def get_date():
|
|||
|
||||
|
||||
def cap_permutations(s):
|
||||
if len(s) > 15:
|
||||
return [s]
|
||||
lu_sequence = ((c.lower(), c.upper()) for c in s)
|
||||
return [''.join(x) for x in it.product(*lu_sequence)]
|
||||
|
||||
|
|
@ -36,8 +39,13 @@ def combine_rows(df, col, possibilities):
|
|||
for p in possibilities:
|
||||
allp += cap_permutations(p)
|
||||
# also have to remove the final column from the possibilities
|
||||
safety = 0
|
||||
while col in allp:
|
||||
allp.remove(col)
|
||||
safety += 1
|
||||
if safety > 100:
|
||||
print(f'Infinite loop detected, shutting down.')
|
||||
exit(1)
|
||||
# list to store replaced columns
|
||||
drops = []
|
||||
# for every column possibility that does exist...
|
||||
|
|
@ -56,11 +64,22 @@ def combine_rows(df, col, possibilities):
|
|||
|
||||
def do_merge_student(cwd, mwd):
|
||||
# identify and merge student files
|
||||
print('---Merging Student Data---')
|
||||
all_files = glob.glob(os.path.join(cwd, "*student*.csv"))
|
||||
print(all_files)
|
||||
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
|
||||
print(f'Found {len(all_files)} CSV files')
|
||||
print('Merging...')
|
||||
files = [pd.read_csv(f) for f in all_files]
|
||||
lines = 0
|
||||
for f in files:
|
||||
lines += f.shape[0]
|
||||
df = pd.concat(files, ignore_index=True)
|
||||
print('Repairing rows...')
|
||||
df = repair_student_rows(df)
|
||||
if df.shape[0] != lines:
|
||||
print(f'Warning! Line count mismatch: {lines} expected, but got {df.shape[0]}')
|
||||
date = get_date()
|
||||
df.to_csv(os.path.join(mwd, f'{date}-student-data-merged.csv'))
|
||||
print('Student data merged successfully!')
|
||||
|
||||
|
||||
def do_merge_teacher(cwd, mwd):
|
||||
|
|
@ -90,6 +109,33 @@ def repair_teacher_rows(df):
|
|||
return df
|
||||
|
||||
|
||||
def repair_student_rows(df):
|
||||
df = combine_rows(df, 'Recorded Date', ['recorded date', 'recordeddate'])
|
||||
df = combine_rows(df, 'Response ID', ['Responseid', 'Response id'])
|
||||
df = combine_rows(df, 'DeseId', ['deseid', 'dese id', 'school'])
|
||||
df = combine_rows(df, 'Grade', ['grade', 'What grade are you in?'])
|
||||
df = combine_rows(df, 'Gender', ['gender', 'Gender - self report', 'What is your gender?', 'What is your gender? - Selected Choice'])
|
||||
df = combine_rows(df, 'Race', ['Race- self report', 'race', 'Race - self report'])
|
||||
print('Combining Question Variants...')
|
||||
df = combine_variants(df)
|
||||
return df
|
||||
|
||||
|
||||
def combine_variants(df):
|
||||
drops = []
|
||||
for col in df:
|
||||
x = re.search(r's-[a-z]{4}-q[0-9]-1', col)
|
||||
if x is not None:
|
||||
# get non variant version
|
||||
nonvar = col[:-2]
|
||||
# combine into non variant
|
||||
df[nonvar] = df[nonvar].replace(r'^\s*$', np.nan, regex=True).fillna(df[col])
|
||||
# and add it to the drop list
|
||||
drops.append(col)
|
||||
df = df.drop(columns=drops)
|
||||
return df
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# load environment vars
|
||||
load_dotenv()
|
||||
|
|
@ -117,5 +163,7 @@ if __name__ == '__main__':
|
|||
c, m = prep_dir(args.folder)
|
||||
if args.teacher:
|
||||
do_merge_teacher(c, m)
|
||||
# if args.student:
|
||||
# do_merge_student(c, m)
|
||||
if args.student:
|
||||
do_merge_student(c, m)
|
||||
|
||||
# TODO: Regex match cols with title s-****-q#-1 and merge with col s-****-q#
|
||||
Loading…
Add table
Add a link
Reference in a new issue