main
Gabe Farrell 3 years ago
parent 5a543b5e13
commit 25898233c3

4
.gitignore vendored

@ -4,4 +4,6 @@ test-*
ecp-csv
test-csv-large
merged
bfg.jar
bfg.jar
*.sh
__pycache__

Binary file not shown.

@ -9,7 +9,8 @@ import re
from urllib.parse import urlparse
import pysftp
# TODO make sure these are not case sensitive
#NOTE for now, each of the arrays should be all lowercase
#TODO eventually make them case agnostic
# all of the columns we want to extract from the csv file
# excluding the question ids (they are found using regex)
final_columns_student = {
@ -20,7 +21,7 @@ final_columns_student = {
'Progress': ['progress'],
'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
'District': ['district', 'please select your school district.'],
'LASID': ['lasid', 'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).'],
'LASID': ['lasid', 'please enter your locally assigned student id number (lasid, or student lunch number).'],
'Grade': ['grade', 'what grade are you in?'],
'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
'Race': ['race'],
@ -42,6 +43,10 @@ final_columns_teacher = {
'Dese Id': ['deseid', 'dese id', 'school'],
}
argVerbose = False
argQuiet = True
class Sftp:
def __init__(self, hostname, username, password, cnopts, port=22):
"""Constructor Method"""
@ -68,12 +73,12 @@ class Sftp:
except Exception as err:
raise Exception(err)
finally:
if not args.quiet: print(f"Connected to {self.hostname} as {self.username}.")
if not argQuiet: print(f"Connected to {self.hostname} as {self.username}.")
def disconnect(self):
"""Closes the sftp connection"""
self.connection.close()
if not args.quiet: print(f"Disconnected from host {self.hostname}")
if not argQuiet: print(f"Disconnected from host {self.hostname}")
def listdir(self, remote_path):
"""lists all the files and directories in the specified path and returns them"""
@ -92,7 +97,7 @@ class Sftp:
"""
try:
if not args.quiet: print(
if not argQuiet: print(
f"downloading from {self.hostname} as {self.username} [(remote path : {remote_path});(local path: {target_local_path})]"
)
@ -106,7 +111,7 @@ class Sftp:
# Download from remote sftp server to local
self.connection.get(remote_path, target_local_path)
if not args.quiet: print("download completed")
if not argQuiet: print("download completed")
except Exception as err:
raise Exception(err)
@ -117,13 +122,13 @@ class Sftp:
"""
try:
if not args.quiet: print(
if not argQuiet: print(
f"uploading to {self.hostname} as {self.username} [(remote path: {remote_path});(source local path: {source_local_path})]"
)
# Download file from SFTP
self.connection.put(source_local_path, remote_path)
if not args.quiet: print("upload completed")
if not argQuiet: print("upload completed")
except Exception as err:
raise Exception(err)
@ -135,10 +140,10 @@ def prep_dir(folder=''):
cwd = os.path.join(os.getcwd(), folder)
mwd = os.path.join(cwd, 'merged')
if not os.path.exists(mwd):
if args.verbose: print(f'Creating directory {mwd}')
if argVerbose: print(f'Creating directory {mwd}')
os.mkdir(mwd)
if args.verbose: print('Source data directory: ' + cwd)
if args.verbose: print('Merged data directory: ' + mwd)
if argVerbose: print('Source data directory: ' + cwd)
if argVerbose: print('Merged data directory: ' + mwd)
return cwd, mwd
@ -162,13 +167,13 @@ def combine_cols(df, col, possibilities):
if cl == col:
continue
# replace the column...
if args.verbose: print(f'Replacing column {cl}')
if argVerbose: print(f'Replacing column {cl}')
df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[cl])
# and add it to the drop list
drops.append(cl)
# drop spent columns
df = df.drop(columns=drops)
if args.verbose: print(f'Dropped columns: {drops}')
if argVerbose: print(f'Dropped columns: {drops}')
return df
@ -182,7 +187,7 @@ def clean_cols_student(df):
if col.lower() not in keep and not bool(question_pattern.match(col)):
drops.append(col)
df = df.drop(columns=drops)
if args.verbose: print(f'Dropped columns: {drops}')
if argVerbose: print(f'Dropped columns: {drops}')
return df
@ -196,20 +201,20 @@ def clean_cols_teacher(df):
if col.lower() not in keep and not bool(question_pattern.match(col)):
drops.append(col)
df = df.drop(columns=drops)
if args.verbose: print(f'Dropped columns: {drops}')
if argVerbose: print(f'Dropped columns: {drops}')
return df
# performs all merging operations for student data
def do_merge_student(cwd, mwd):
# identify and merge student files
if not args.quiet: print('---Merging Student Data---')
if not argQuiet: print('---Merging Student Data---')
all_files = glob.glob(os.path.join(cwd, "*student*.csv"))
if not args.quiet: print(f'Found {len(all_files)} Student CSV files')
if not argQuiet: print(f'Found {len(all_files)} Student CSV files')
if len(all_files) < 1:
if not args.quiet: print('No files found. Skipping merge...')
if not argQuiet: print('No files found. Skipping merge...')
return
if not args.quiet: print('Merging...')
if not argQuiet: print('Merging...')
files = [pd.read_csv(f, low_memory=False) for f in all_files]
# count lines in read csv files
lines = 0
@ -218,10 +223,10 @@ def do_merge_student(cwd, mwd):
# combine csv files
df = pd.concat(files, axis=0)
# combine related columns
if not args.quiet: print('Repairing rows...')
if not argQuiet: print('Repairing rows...')
df = repair_student_columns(df)
# clean out unnecessary columns
if not args.quiet: print('Cleaning out columns...')
if not argQuiet: print('Cleaning out columns...')
df = clean_cols_student(df)
# ensure line count matches what is expected
if df.shape[0] != lines:
@ -234,20 +239,20 @@ def do_merge_student(cwd, mwd):
proj = ''
fn = f'{date}{proj}-student-data-merged.csv'
df.to_csv(os.path.join(mwd, fn), index=False)
if not args.quiet: print('Student data merged successfully!')
if not argQuiet: print('Student data merged successfully!')
return fn
# performs all merging operations for teacher data
def do_merge_teacher(cwd, mwd):
# identify and merge teacher files
if not args.quiet: print('---Merging Teacher Data---')
if not argQuiet: print('---Merging Teacher Data---')
all_files = glob.glob(os.path.join(cwd, "*teacher*.csv"))
if not args.quiet: print(f'Found {len(all_files)} Teacher CSV files')
if not argQuiet: print(f'Found {len(all_files)} Teacher CSV files')
if len(all_files) < 1:
if not args.quiet: print('No files found. Skipping merge...')
if not argQuiet: print('No files found. Skipping merge...')
return
if not args.quiet: print('Merging...')
if not argQuiet: print('Merging...')
files = [pd.read_csv(f, low_memory=False) for f in all_files]
# count lines in read csv files
lines = 0
@ -256,10 +261,10 @@ def do_merge_teacher(cwd, mwd):
# combine csv files
df = pd.concat(files, axis=0)
# combine related columns
if not args.quiet: print('Repairing columns...')
if not argQuiet: print('Repairing columns...')
df = repair_teacher_columns(df)
# clean out unnecessary columns
if not args.quiet: print('Cleaning out columns...')
if not argQuiet: print('Cleaning out columns...')
df = clean_cols_teacher(df)
# ensure line count matches what is expected
if df.shape[0] != lines:
@ -272,7 +277,7 @@ def do_merge_teacher(cwd, mwd):
proj = ''
fn = f'{date}{proj}-teacher-data-merged.csv'
df.to_csv(os.path.join(mwd, fn), index=False)
if not args.quiet: print('Teacher data merged successfully!')
if not argQuiet: print('Teacher data merged successfully!')
return fn
@ -288,7 +293,7 @@ def repair_teacher_columns(df):
def repair_student_columns(df):
for col in final_columns_student:
df = combine_cols(df, col, final_columns_student[col])
if not args.quiet: print('Combining Question Variants...')
if not argQuiet: print('Combining Question Variants...')
df = combine_variants(df)
return df
@ -309,6 +314,7 @@ def combine_variants(df):
return df
if __name__ == '__main__':
# parse flags
@ -344,23 +350,26 @@ if __name__ == '__main__':
help='sftp url for remote merging')
args = parser.parse_args()
argVerbose = args.verbose
argQuiet = args.quiet
#quiet takes precedence over verbose
if args.quiet:
args.verbose = False
if argQuiet:
argVerbose = False
# make sure -s or -t is set
if not (args.student or args.teacher):
if not args.quiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
if not argQuiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
if args.directory and not args.remote_url:
c, m = prep_dir(args.directory)
elif not args.directory:
if not args.quiet: print('Notice: No directory specified. Defaulting to current directory.')
if not argQuiet: print('Notice: No directory specified. Defaulting to current directory.')
c, m = prep_dir()
# prepare sftp if flagged
if args.remote_url:
if not args.quiet: print(f'Remote destination set, fetching files...')
if not argQuiet: print(f'Remote destination set, fetching files...')
parsed_url = urlparse(args.remote_url)
cnopts = pysftp.CnOpts()
cnopts.hostkeys = None
@ -386,7 +395,7 @@ if __name__ == '__main__':
for file in sftp.listdir_attr(path):
if file.filename.endswith(".csv"):
filelist.append(file.filename)
if not args.quiet: print(f'Fetching file {file.filename}...')
if not argQuiet: print(f'Fetching file {file.filename}...')
sftp.download(path + file.filename, c + file.filename)
# perform merges
@ -397,11 +406,11 @@ if __name__ == '__main__':
if args.remote_url:
# upload tmd and smd to remote
if not args.quiet: print('Uploading merged data...')
if not argQuiet: print('Uploading merged data...')
sftp.upload(m + '/' + tmd, path + 'merged/' + tmd)
sftp.upload(m + '/' + smd, path + 'merged/' + smd)
# remove merged directory
if not args.quiet: print('Cleaning up...')
if not argQuiet: print('Cleaning up...')
os.remove(m + '/' + tmd)
os.remove(m + '/' + smd)
os.rmdir(m)
@ -410,4 +419,4 @@ if __name__ == '__main__':
if os.path.exists(f):
os.remove(f)
sftp.disconnect()
if not args.quiet: print('Done!')
if not argQuiet: print('Done!')

@ -0,0 +1,223 @@
import merge
import unittest
import pandas as pd
tdata_student = pd.DataFrame({
'BadColumn': ['1', '2', '3', '4'],
'Gender': ['1', '', '', ''],
'gender:': ['', '', '2', '1'],
'Gender - SIS': ['', '2', '', ''],
})
class TestMergeCSV(unittest.TestCase):
def test_combine_variants(self):
td = pd.DataFrame({
's-peff-q1': ['1', '2', '3', '', ''],
's-peff-q10': ['1', '2', '3', '', ''],
's-peff-q1-1': ['', '', '', '4', '5'],
's-peff-q10-1': ['', '', '', '4', '5'],
})
td = merge.combine_variants(td)
expected = pd.DataFrame({
's-peff-q1': ['1', '2', '3', '4', '5'],
's-peff-q10': ['1', '2', '3', '4', '5'],
})
notexpected = pd.DataFrame({
's-peff-q1-1': ['1', '2', '3', '4', '5'],
's-peff-q10-1': ['1', '2', '3', '4', '5'],
})
self.assertTrue(td.equals(expected))
self.assertFalse(td.equals(notexpected))
def test_clean_cols_student(self):
td = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'LASID': ['1', '2', '3', '4', '5'],
'Grade': ['1', '2', '3', '4', '5'],
'Race': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
'BadColumn': ['x', 'x', 'x', 'x', 'x'],
'Gender - SIS': ['1', '2', '3', '', ''],
'Gender': ['1', '2', '1', '2', '2'],
'Response Id': ['1', '2', '3', '4', '5'],
's-peff-q1': ['1', '2', '3', '4', '5'],
's-peff-q10': ['1', '2', '3', '4', '5'],
's-peff-q1.1': ['1', '2', '3', '4', '5'],
})
td = merge.clean_cols_student(td)
expected = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'LASID': ['1', '2', '3', '4', '5'],
'Grade': ['1', '2', '3', '4', '5'],
'Race': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
'Gender': ['1', '2', '1', '2', '2'],
'Response Id': ['1', '2', '3', '4', '5'],
's-peff-q1': ['1', '2', '3', '4', '5'],
's-peff-q10': ['1', '2', '3', '4', '5'],
})
notexpected = pd.DataFrame({
'BadColumn': ['x', 'x', 'x', 'x', 'x'],
'Gender - SIS': ['1', '2', '3', '', ''],
's-peff-q1.1': ['1', '2', '3', '4', '5'],
})
self.assertTrue(td.equals(expected), td)
self.assertFalse(td.equals(notexpected), td)
def test_clean_cols_teacher(self):
td = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'BadColumn': ['x', 'x', 'x', 'x', 'x'],
'Blah Blah Blah': ['x', 'x', 'x', 'x', 'x'],
'Abbey Road': ['x', 'x', 'x', 'x', 'x'],
'Please List Your Cats': ['x', 'x', 'x', 'x', 'x'],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
't-peff-q1': ['1', '2', '3', '4', '5'],
't-peff-q10': ['1', '2', '3', '4', '5'],
't-peff-q1.1': ['1', '2', '3', '4', '5'],
't-peff-q10.1': ['1', '2', '3', '4', '5'],
})
td = merge.clean_cols_teacher(td)
expected = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
't-peff-q1': ['1', '2', '3', '4', '5'],
't-peff-q10': ['1', '2', '3', '4', '5'],
})
notexpected = pd.DataFrame({
'BadColumn': ['x', 'x', 'x', 'x', 'x'],
'Blah Blah Blah': ['x', 'x', 'x', 'x', 'x'],
'Abbey Road': ['x', 'x', 'x', 'x', 'x'],
'Please List Your Cats': ['x', 'x', 'x', 'x', 'x'],
't-peff-q1.1': ['1', '2', '3', '4', '5'],
't-peff-q10.1': ['1', '2', '3', '4', '5'],
})
self.assertTrue(td.equals(expected), td)
self.assertFalse(td.equals(notexpected), td)
def test_combine_cols(self):
td = pd.DataFrame({
'My Column': ['1', '', '', '', ''],
'My Other Column': ['', '2', '3', '', ''],
'Not My Column': ['1', '2', '3', '4', '5'],
'My Last Column': ['', '', '', '4', '5'],
})
expected = pd.DataFrame({
'My Column': ['1', '2', '3', '4', '5'],
'Not My Column': ['1', '2', '3', '4', '5'],
})
td = merge.combine_cols(td, 'My Column', ['my other column', 'my last column'])
self.assertTrue(td.equals(expected), f'\n{td}')
def test_repair_cols_student(self):
td = pd.DataFrame({
'Start Date': ['', '', '', '4', '5'],
'End Date': ['', '', '', '4', '5'],
'Ip Address': ['', '', '', '4', '5'],
'StartDate': ['1', '2', '3', '', ''],
'EndDate': ['1', '2', '3', '', ''],
'IpAddress': ['1', '2', '3', '', ''],
'Status': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['', '', '', '4', '5'],
'RecordedDate': ['1', '2', '3', '', ''],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['', '', '', '4', '5'],
'School': ['1', '2', '3', '', ''],
'LASID': ['1', '2', '3', '', ''],
'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).': ['', '', '', '4', '5'],
'Grade': ['1', '2', '3', '', ''],
'What grade are you in?': ['', '', '', '4', '5'],
'Gender': ['1', '2', '3', '', ''],
'What is your gender?': ['', '', '', '4', '5'],
'Race': ['1', '2', '3', '4', '5'],
})
expected = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
'LASID': ['1', '2', '3', '4', '5'],
'Grade': ['1', '2', '3', '4', '5'],
'Gender': ['1', '2', '3', '4', '5'],
'Race': ['1', '2', '3', '4', '5'],
})
td = merge.repair_student_columns(td)
self.assertTrue(td.equals(expected), f'\n{td}')
def test_repair_cols_teacher(self):
td = pd.DataFrame({
'Start Date': ['', '', '', '4', '5'],
'End Date': ['', '', '', '4', '5'],
'Ip Address': ['', '', '', '4', '5'],
'StartDate': ['1', '2', '3', '', ''],
'EndDate': ['1', '2', '3', '', ''],
'IpAddress': ['1', '2', '3', '', ''],
'Status': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['', '', '', '4', '5'],
'RecordedDate': ['1', '2', '3', '', ''],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['', '', '', '4', '5'],
'School': ['1', '2', '3', '', ''],
})
expected = pd.DataFrame({
'Start Date': ['1', '2', '3', '4', '5'],
'End Date': ['1', '2', '3', '4', '5'],
'Ip Address': ['1', '2', '3', '4', '5'],
'Status': ['1', '2', '3', '4', '5'],
'Progress': ['1', '2', '3', '4', '5'],
'Duration': ['1', '2', '3', '4', '5'],
'District': ['1', '2', '3', '4', '5'],
'Recorded Date': ['1', '2', '3', '4', '5'],
'Response Id': ['1', '2', '3', '4', '5'],
'Dese Id': ['1', '2', '3', '4', '5'],
})
td = merge.repair_teacher_columns(td)
self.assertTrue(td.equals(expected), td)
if __name__ == '__main__':
unittest.main()
Loading…
Cancel
Save