column merging fixed

main
Gabe Farrell 3 years ago
parent 8f8c14446e
commit 676cb9d931

12
.gitignore vendored

@ -1,7 +1,7 @@
.env .env
.idea .idea
test-csv test-*
ecp-csv ecp-csv
test-csv-large test-csv-large
merged merged
bfg.jar bfg.jar

@ -9,6 +9,38 @@ import re
from urllib.parse import urlparse from urllib.parse import urlparse
import pysftp import pysftp
# TODO make sure these are not case sensitive
# Student columns to extract from the csv file, keyed by the final column
# name; each value lists the alternative header spellings that are handed to
# combine_cols as "possibilities" for that final column.
# Question id columns are not listed here (they are found using regex).
final_columns_student = {
    'Start Date': ['startdate', 'start date'],
    'End Date': ['enddate', 'end date'],
    'Status': ['status'],
    'Ip Address': ['ip address', 'ipaddress'],
    'Progress': ['progress'],
    'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
    'District': ['district', 'please select your school district.'],
    # NOTE(review): the long alias below is mixed-case while every other
    # alias in this table is lowercase -- confirm how combine_cols compares
    # header names before relying on it.
    'LASID': [
        'lasid',
        'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).',
    ],
    'Grade': ['grade', 'what grade are you in?'],
    'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
    'Race': ['race'],
    'Recorded Date': ['recorded date', 'recordeddate'],
    'Response Id': ['responseid', 'response id'],
    'Dese Id': ['deseid', 'dese id', 'school'],
}
# Teacher columns to extract from the csv file, keyed by the final column
# name; each value lists the alternative header spellings that are handed to
# combine_cols as "possibilities" for that final column.
final_columns_teacher = {
    'Start Date': ['startdate', 'start date'],
    'End Date': ['enddate', 'end date'],
    'Status': ['status'],
    'Ip Address': ['ip address', 'ipaddress'],
    'Progress': ['progress'],
    'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
    'District': ['district', 'please select your school district.'],
    'Recorded Date': ['recorded date', 'recordeddate'],
    'Response Id': ['responseid', 'response id'],
    'Dese Id': ['deseid', 'dese id', 'school'],
}
class Sftp: class Sftp:
def __init__(self, hostname, username, password, cnopts, port=22): def __init__(self, hostname, username, password, cnopts, port=22):
@ -97,28 +129,25 @@ class Sftp:
raise Exception(err) raise Exception(err)
# prepare csv and merged csv directories
def prep_dir(folder=''): def prep_dir(folder=''):
# prepare directories # prepare directories
cwd = os.path.join(os.getcwd(), folder) cwd = os.path.join(os.getcwd(), folder)
mwd = os.path.join(cwd, 'merged') mwd = os.path.join(cwd, 'merged')
if not os.path.exists(mwd): if not os.path.exists(mwd):
if args.verbose: print(f'Creating directory {mwd}')
os.mkdir(mwd) os.mkdir(mwd)
if args.verbose: print('Source data directory: ' + cwd) if args.verbose: print('Source data directory: ' + cwd)
if args.verbose: print('Merged data directory: ' + mwd) if args.verbose: print('Merged data directory: ' + mwd)
return cwd, mwd return cwd, mwd
# get current date in Month-XX-YYYY format
def get_date(): def get_date():
return datetime.date.today().strftime("%B-%d-%Y") return datetime.date.today().strftime("%B-%d-%Y")
# UNUSED
# def cap_permutations(s):
# if len(s) > 15:
# return [s]
# lu_sequence = ((c.lower(), c.upper()) for c in s)
# return [''.join(x) for x in it.product(*lu_sequence)]
# in dataframe df, merges any column in possibilities into the final column col
def combine_cols(df, col, possibilities): def combine_cols(df, col, possibilities):
# if final column doesn't exist, create it # if final column doesn't exist, create it
if col not in df.columns: if col not in df.columns:
@ -143,38 +172,12 @@ def combine_cols(df, col, possibilities):
return df return df
def clean_cols(df): # removes unused columns from student data
keep = [ def clean_cols_student(df):
'StartDate', keep = list(final_columns_student.keys())
'EndDate',
'Start Date',
'End Date',
'Status',
'Response Type',
'IpAddress',
'Ip Address'
'Progress',
'Duration',
'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).',
'Finished',
'District',
'LASID',
'Recorded Date',
'RecordedDate',
'Grade',
'Gender',
'Race',
'Response Id',
'ResponseId',
'DeseId',
'Dese Id',
'School',
'District',
'Please select your school district.',
]
keep = list(map(str.lower, keep)) keep = list(map(str.lower, keep))
drops = [] drops = []
question_pattern = re.compile("^[s,t]-[a-zA-Z]{4}-q[0-9][0-9]?$") question_pattern = re.compile("^s-[a-zA-Z]{4}-q[0-9][0-9]?$")
for col in df.columns: for col in df.columns:
if col.lower() not in keep and not bool(question_pattern.match(col)): if col.lower() not in keep and not bool(question_pattern.match(col)):
drops.append(col) drops.append(col)
@ -183,7 +186,21 @@ def clean_cols(df):
return df return df
# removes unused columns from teacher data
def clean_cols_teacher(df):
    """Return df without the columns that teacher output does not use.

    A column survives when its lowercased name equals the lowercased form of
    a key in final_columns_teacher, or when its name matches the teacher
    question id pattern (t-XXXX-qN or t-XXXX-qNN, matched case-sensitively
    on the 't' and 'q'). Every other column is dropped.

    When args.verbose is truthy the list of dropped column names is printed.
    `args` and `final_columns_teacher` are module-level names defined
    outside this function.
    """
    wanted = [name.lower() for name in final_columns_teacher]
    question_id = re.compile("^t-[a-zA-Z]{4}-q[0-9][0-9]?$")
    # Collect, in column order, everything that is neither a wanted metadata
    # column nor a teacher question column.
    unused = [
        name
        for name in df.columns
        if not (name.lower() in wanted or question_id.match(name))
    ]
    df = df.drop(columns=unused)
    if args.verbose:
        print(f'Dropped columns: {unused}')
    return df
# performs all merging operations for student data
def do_merge_student(cwd, mwd): def do_merge_student(cwd, mwd):
# identify and merge student files # identify and merge student files
if not args.quiet: print('---Merging Student Data---') if not args.quiet: print('---Merging Student Data---')
@ -194,16 +211,22 @@ def do_merge_student(cwd, mwd):
return return
if not args.quiet: print('Merging...') if not args.quiet: print('Merging...')
files = [pd.read_csv(f, low_memory=False) for f in all_files] files = [pd.read_csv(f, low_memory=False) for f in all_files]
# count lines in read csv files
lines = 0 lines = 0
for fi in files: for fi in files:
lines += fi.shape[0] lines += fi.shape[0]
# combine csv files
df = pd.concat(files, axis=0) df = pd.concat(files, axis=0)
# combine related columns
if not args.quiet: print('Repairing rows...') if not args.quiet: print('Repairing rows...')
df = repair_student_rows(df) df = repair_student_columns(df)
# clean out unnecessary columns
if not args.quiet: print('Cleaning out columns...') if not args.quiet: print('Cleaning out columns...')
df = clean_cols(df) df = clean_cols_student(df)
# ensure line count matches what is expected
if df.shape[0] != lines: if df.shape[0] != lines:
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}') print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
# save merged file
date = get_date() date = get_date()
if args.project: if args.project:
proj = '-' + args.project proj = '-' + args.project
@ -215,6 +238,7 @@ def do_merge_student(cwd, mwd):
return fn return fn
# performs all merging operations for teacher data
def do_merge_teacher(cwd, mwd): def do_merge_teacher(cwd, mwd):
# identify and merge teacher files # identify and merge teacher files
if not args.quiet: print('---Merging Teacher Data---') if not args.quiet: print('---Merging Teacher Data---')
@ -225,16 +249,22 @@ def do_merge_teacher(cwd, mwd):
return return
if not args.quiet: print('Merging...') if not args.quiet: print('Merging...')
files = [pd.read_csv(f, low_memory=False) for f in all_files] files = [pd.read_csv(f, low_memory=False) for f in all_files]
# count lines in read csv files
lines = 0 lines = 0
for f in files: for f in files:
lines += f.shape[0] lines += f.shape[0]
# combine csv files
df = pd.concat(files, axis=0) df = pd.concat(files, axis=0)
if not args.quiet: print('Repairing rows...') # combine related columns
df = repair_teacher_rows(df) if not args.quiet: print('Repairing columns...')
df = repair_teacher_columns(df)
# clean out unnecessary columns
if not args.quiet: print('Cleaning out columns...') if not args.quiet: print('Cleaning out columns...')
df = clean_cols(df) df = clean_cols_teacher(df)
# ensure line count matches what is expected
if df.shape[0] != lines: if df.shape[0] != lines:
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}') print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
# save merged file
date = get_date() date = get_date()
if args.project: if args.project:
proj = '-' + args.project proj = '-' + args.project
@ -246,29 +276,28 @@ def do_merge_teacher(cwd, mwd):
return fn return fn
def repair_teacher_rows(df): # merges teacher columns that may have mismatched names
df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate']) def repair_teacher_columns(df):
df = combine_cols(df, 'Response ID', ['responseid', 'response id']) for col in final_columns_teacher:
df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school']) df = combine_cols(df, col, final_columns_teacher[col])
return df return df
def repair_student_rows(df): # merges student columns that may have mismatched names,
df = combine_cols(df, 'Recorded Date', ['recorded date', 'recordeddate']) # and combines question variants
df = combine_cols(df, 'Response ID', ['responseid', 'response id']) def repair_student_columns(df):
df = combine_cols(df, 'DeseId', ['deseid', 'dese id', 'school']) for col in final_columns_student:
df = combine_cols(df, 'Grade', ['grade', 'what grade are you in?']) df = combine_cols(df, col, final_columns_student[col])
df = combine_cols(df, 'Gender', ['gender', 'what is your gender?', 'what is your gender? - selected choice'])
df = combine_cols(df, 'Race', ['race'])
if not args.quiet: print('Combining Question Variants...') if not args.quiet: print('Combining Question Variants...')
df = combine_variants(df) df = combine_variants(df)
return df return df
# combines question variants into non-variant columns
def combine_variants(df): def combine_variants(df):
drops = [] drops = []
for col in df: for col in df:
x = re.search(r's-[a-z]{4}-q[0-9][0-9]?-1', col) x = re.search(r'^s-[a-z]{4}-q[0-9][0-9]?-1$', col)
if x is not None: if x is not None:
# get non variant version # get non variant version
nonvar = col[:-2] nonvar = col[:-2]

Loading…
Cancel
Save