|
|
|
@ -9,6 +9,38 @@ import re
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
import pysftp
|
|
|
|
import pysftp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TODO make sure these are not case sensitive
|
|
|
|
|
|
|
|
# all of the columns we want to extract from the csv file
|
|
|
|
|
|
|
|
# excluding the question ids (they are found using regex)
|
|
|
|
|
|
|
|
# Canonical student column name -> header variants that are merged into it.
# The keys are also the non-question columns kept in the merged student file.
# Every alias is written in lower case so the table is uniform.
# NOTE(review): this assumes column headers are lower-cased before being
# compared against these aliases -- confirm against combine_cols.
final_columns_student = {
    'Start Date': ['startdate', 'start date'],
    'End Date': ['enddate', 'end date'],
    'Status': ['status'],
    'Ip Address': ['ip address', 'ipaddress'],
    'Progress': ['progress'],
    'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
    'District': ['district', 'please select your school district.'],
    # previously the only mixed-case alias in the table; lower-cased to match the rest
    'LASID': [
        'lasid',
        'please enter your locally assigned student id number (lasid, or student lunch number).',
    ],
    'Grade': ['grade', 'what grade are you in?'],
    'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
    'Race': ['race'],
    'Recorded Date': ['recorded date', 'recordeddate'],
    'Response Id': ['responseid', 'response id'],
    'Dese Id': ['deseid', 'dese id', 'school'],
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Canonical teacher column name -> lower-case header variants that are merged
# into it. The keys are also the non-question columns kept in the merged
# teacher file.
final_columns_teacher = {
    # survey timing / bookkeeping
    "Start Date": ["startdate", "start date"],
    "End Date": ["enddate", "end date"],
    "Status": ["status"],
    "Ip Address": ["ip address", "ipaddress"],
    "Progress": ["progress"],
    "Duration": [
        "duration",
        "duration..in.seconds",
        "duration (in seconds)",
    ],
    # respondent location
    "District": [
        "district",
        "please select your school district.",
    ],
    # response identity
    "Recorded Date": ["recorded date", "recordeddate"],
    "Response Id": ["responseid", "response id"],
    "Dese Id": ["deseid", "dese id", "school"],
}
|
|
|
|
|
|
|
|
|
|
|
|
class Sftp:
|
|
|
|
class Sftp:
|
|
|
|
def __init__(self, hostname, username, password, cnopts, port=22):
|
|
|
|
def __init__(self, hostname, username, password, cnopts, port=22):
|
|
|
|
@ -97,28 +129,25 @@ class Sftp:
|
|
|
|
raise Exception(err)
|
|
|
|
raise Exception(err)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# prepare csv and merged csv directories
|
|
|
|
def prep_dir(folder=''):
    """Locate the source data directory and ensure its ``merged`` subdirectory exists.

    ``folder`` is joined onto the current working directory to form the source
    directory; the merged directory is ``<source>/merged`` and is created with
    ``os.mkdir`` when it is not already present. Progress messages are printed
    only when ``args.verbose`` is set.

    Returns a ``(source_dir, merged_dir)`` tuple of paths.
    """
    source_dir = os.path.join(os.getcwd(), folder)
    merged_dir = os.path.join(source_dir, 'merged')

    merged_missing = not os.path.exists(merged_dir)
    if merged_missing:
        if args.verbose:
            print(f'Creating directory {merged_dir}')
        os.mkdir(merged_dir)

    if args.verbose:
        print('Source data directory: ' + source_dir)
        print('Merged data directory: ' + merged_dir)

    return source_dir, merged_dir
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# get current date in Month-DD-YYYY format (full month name, zero-padded day)
|
|
|
|
def get_date():
    """Return today's date as a ``Month-DD-YYYY`` string.

    Uses the ``%B-%d-%Y`` format: full month name, zero-padded day of month,
    four-digit year, separated by hyphens.
    """
    today = datetime.date.today()
    # date.__format__ with a non-empty spec delegates to strftime
    return f"{today:%B-%d-%Y}"
|
|
|
|
|
|
|
|
|
|
|
|
# UNUSED
|
|
|
|
|
|
|
|
# def cap_permutations(s):
|
|
|
|
|
|
|
|
# if len(s) > 15:
|
|
|
|
|
|
|
|
# return [s]
|
|
|
|
|
|
|
|
# lu_sequence = ((c.lower(), c.upper()) for c in s)
|
|
|
|
|
|
|
|
# return [''.join(x) for x in it.product(*lu_sequence)]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# in dataframe df, merges any column in possibilities into the final column col
|
|
|
|
def combine_cols(df, col, possibilities):
|
|
|
|
def combine_cols(df, col, possibilities):
|
|
|
|
# if final column doesn't exist, create it
|
|
|
|
# if final column doesn't exist, create it
|
|
|
|
if col not in df.columns:
|
|
|
|
if col not in df.columns:
|
|
|
|
@ -143,38 +172,12 @@ def combine_cols(df, col, possibilities):
|
|
|
|
return df
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_cols(df):
|
|
|
|
# removes unused columns from student data
|
|
|
|
keep = [
|
|
|
|
def clean_cols_student(df):
|
|
|
|
'StartDate',
|
|
|
|
keep = list(final_columns_student.keys())
|
|
|
|
'EndDate',
|
|
|
|
|
|
|
|
'Start Date',
|
|
|
|
|
|
|
|
'End Date',
|
|
|
|
|
|
|
|
'Status',
|
|
|
|
|
|
|
|
'Response Type',
|
|
|
|
|
|
|
|
'IpAddress',
|
|
|
|
|
|
|
|
'Ip Address'
|
|
|
|
|
|
|
|
'Progress',
|
|
|
|
|
|
|
|
'Duration',
|
|
|
|
|
|
|
|
'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).',
|
|
|
|
|
|
|
|
'Finished',
|
|
|
|
|
|
|
|
'District',
|
|
|
|
|
|
|
|
'LASID',
|
|
|
|
|
|
|
|
'Recorded Date',
|
|
|
|
|
|
|
|
'RecordedDate',
|
|
|
|
|
|
|
|
'Grade',
|
|
|
|
|
|
|
|
'Gender',
|
|
|
|
|
|
|
|
'Race',
|
|
|
|
|
|
|
|
'Response Id',
|
|
|
|
|
|
|
|
'ResponseId',
|
|
|
|
|
|
|
|
'DeseId',
|
|
|
|
|
|
|
|
'Dese Id',
|
|
|
|
|
|
|
|
'School',
|
|
|
|
|
|
|
|
'District',
|
|
|
|
|
|
|
|
'Please select your school district.',
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
keep = list(map(str.lower, keep))
|
|
|
|
keep = list(map(str.lower, keep))
|
|
|
|
drops = []
|
|
|
|
drops = []
|
|
|
|
question_pattern = re.compile("^[s,t]-[a-zA-Z]{4}-q[0-9][0-9]?$")
|
|
|
|
question_pattern = re.compile("^s-[a-zA-Z]{4}-q[0-9][0-9]?$")
|
|
|
|
for col in df.columns:
|
|
|
|
for col in df.columns:
|
|
|
|
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
|
|
|
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
|
|
|
drops.append(col)
|
|
|
|
drops.append(col)
|
|
|
|
@ -183,7 +186,21 @@ def clean_cols(df):
|
|
|
|
return df
|
|
|
|
return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# removes unused columns from teacher data
|
|
|
|
|
|
|
|
def clean_cols_teacher(df):
    """Drop every column of teacher data that is neither a final column nor a question.

    A column is kept when its lower-cased name equals the lower-cased form of a
    key of ``final_columns_teacher``, or when its name matches the teacher
    question-id pattern (``t-XXXX-qN`` / ``t-XXXX-qNN``). All other columns are
    dropped; the dropped names are printed when ``args.verbose`` is set.

    Returns the dataframe without the dropped columns.
    """
    wanted = [name.lower() for name in final_columns_teacher]
    question_pattern = re.compile("^t-[a-zA-Z]{4}-q[0-9][0-9]?$")

    drops = [
        col
        for col in df.columns
        if not (col.lower() in wanted or question_pattern.match(col))
    ]

    df = df.drop(columns=drops)
    if args.verbose:
        print(f'Dropped columns: {drops}')
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# performs all merging operations for student data
|
|
|
|
def do_merge_student(cwd, mwd):
|
|
|
|
def do_merge_student(cwd, mwd):
|
|
|
|
# identify and merge student files
|
|
|
|
# identify and merge student files
|
|
|
|
if not args.quiet: print('---Merging Student Data---')
|
|
|
|
if not args.quiet: print('---Merging Student Data---')
|
|
|
|
@ -194,16 +211,22 @@ def do_merge_student(cwd, mwd):
|
|
|
|
return
|
|
|
|
return
|
|
|
|
if not args.quiet: print('Merging...')
|
|
|
|
if not args.quiet: print('Merging...')
|
|
|
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
|
|
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
|
|
|
|
|
|
|
# count lines in read csv files
|
|
|
|
lines = 0
|
|
|
|
lines = 0
|
|
|
|
for fi in files:
|
|
|
|
for fi in files:
|
|
|
|
lines += fi.shape[0]
|
|
|
|
lines += fi.shape[0]
|
|
|
|
|
|
|
|
# combine csv files
|
|
|
|
df = pd.concat(files, axis=0)
|
|
|
|
df = pd.concat(files, axis=0)
|
|
|
|
|
|
|
|
# combine related columns
|
|
|
|
if not args.quiet: print('Repairing rows...')
|
|
|
|
if not args.quiet: print('Repairing rows...')
|
|
|
|
df = repair_student_rows(df)
|
|
|
|
df = repair_student_columns(df)
|
|
|
|
|
|
|
|
# clean out unnecessary columns
|
|
|
|
if not args.quiet: print('Cleaning out columns...')
|
|
|
|
if not args.quiet: print('Cleaning out columns...')
|
|
|
|
df = clean_cols(df)
|
|
|
|
df = clean_cols_student(df)
|
|
|
|
|
|
|
|
# ensure line count matches what is expected
|
|
|
|
if df.shape[0] != lines:
|
|
|
|
if df.shape[0] != lines:
|
|
|
|
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
|
|
|
|
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
|
|
|
|
|
|
|
|
# save merged file
|
|
|
|
date = get_date()
|
|
|
|
date = get_date()
|
|
|
|
if args.project:
|
|
|
|
if args.project:
|
|
|
|
proj = '-' + args.project
|
|
|
|
proj = '-' + args.project
|
|
|
|
@ -215,6 +238,7 @@ def do_merge_student(cwd, mwd):
|
|
|
|
return fn
|
|
|
|
return fn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# performs all merging operations for teacher data
|
|
|
|
def do_merge_teacher(cwd, mwd):
|
|
|
|
def do_merge_teacher(cwd, mwd):
|
|
|
|
# identify and merge teacher files
|
|
|
|
# identify and merge teacher files
|
|
|
|
if not args.quiet: print('---Merging Teacher Data---')
|
|
|
|
if not args.quiet: print('---Merging Teacher Data---')
|
|
|
|
@ -225,16 +249,22 @@ def do_merge_teacher(cwd, mwd):
|
|
|
|
return
|
|
|
|
return
|
|
|
|
if not args.quiet: print('Merging...')
|
|
|
|
if not args.quiet: print('Merging...')
|
|
|
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
|
|
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
|
|
|
|
|
|
|
# count lines in read csv files
|
|
|
|
lines = 0
|
|
|
|
lines = 0
|
|
|
|
for f in files:
|
|
|
|
for f in files:
|
|
|
|
lines += f.shape[0]
|
|
|
|
lines += f.shape[0]
|
|
|
|
|
|
|
|
# combine csv files
|
|
|
|
df = pd.concat(files, axis=0)
|
|
|
|
df = pd.concat(files, axis=0)
|
|
|
|
if not args.quiet: print('Repairing rows...')
|
|
|
|
# combine related columns
|
|
|
|
df = repair_teacher_rows(df)
|
|
|
|
if not args.quiet: print('Repairing columns...')
|
|
|
|
|
|
|
|
df = repair_teacher_columns(df)
|
|
|
|
|
|
|
|
# clean out unnecessary columns
|
|
|
|
if not args.quiet: print('Cleaning out columns...')
|
|
|
|
if not args.quiet: print('Cleaning out columns...')
|
|
|
|
df = clean_cols(df)
|
|
|
|
df = clean_cols_teacher(df)
|
|
|
|
|
|
|
|
# ensure line count matches what is expected
|
|
|
|
if df.shape[0] != lines:
|
|
|
|
if df.shape[0] != lines:
|
|
|
|
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
|
|
|
|
print(f'Warning: Line count mismatch: {lines} expected, but got {df.shape[0]}')
|
|
|
|
|
|
|
|
# save merged file
|
|
|
|
date = get_date()
|
|
|
|
date = get_date()
|
|
|
|
if args.project:
|
|
|
|
if args.project:
|
|
|
|
proj = '-' + args.project
|
|
|
|
proj = '-' + args.project
|
|
|
|
@ -246,29 +276,28 @@ def do_merge_teacher(cwd, mwd):
|
|
|
|
return fn
|
|
|
|
return fn
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def repair_teacher_columns(df):
    """Merge teacher columns that may have mismatched names.

    For each entry of ``final_columns_teacher`` the final column name and its
    list of header variants are handed to ``combine_cols``, and the dataframe
    it returns is carried into the next entry.

    Returns the dataframe produced by the last ``combine_cols`` call.
    """
    for final_name, aliases in final_columns_teacher.items():
        df = combine_cols(df, final_name, aliases)
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def repair_student_columns(df):
    """Merge student columns that may have mismatched names, then combine question variants.

    Each entry of ``final_columns_student`` (final column name plus its header
    variants) is passed through ``combine_cols``; afterwards the dataframe is
    handed to ``combine_variants``. A progress line is printed before the
    variant step unless ``args.quiet`` is set.

    Returns the dataframe returned by ``combine_variants``.
    """
    for final_name, aliases in final_columns_student.items():
        df = combine_cols(df, final_name, aliases)

    if not args.quiet:
        print('Combining Question Variants...')
    return combine_variants(df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# combines question variants into non-variant columns
|
|
|
|
def combine_variants(df):
|
|
|
|
def combine_variants(df):
|
|
|
|
drops = []
|
|
|
|
drops = []
|
|
|
|
for col in df:
|
|
|
|
for col in df:
|
|
|
|
x = re.search(r's-[a-z]{4}-q[0-9][0-9]?-1', col)
|
|
|
|
x = re.search(r'^s-[a-z]{4}-q[0-9][0-9]?-1$', col)
|
|
|
|
if x is not None:
|
|
|
|
if x is not None:
|
|
|
|
# get non variant version
|
|
|
|
# get non variant version
|
|
|
|
nonvar = col[:-2]
|
|
|
|
nonvar = col[:-2]
|
|
|
|
|