mirror of
https://github.com/edcommonwealth/merge-csv.git
synced 2026-03-07 21:48:13 -08:00
Add Tests
This commit is contained in:
parent
5a543b5e13
commit
25898233c3
4 changed files with 274 additions and 40 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -5,3 +5,5 @@ ecp-csv
|
||||||
test-csv-large
|
test-csv-large
|
||||||
merged
|
merged
|
||||||
bfg.jar
|
bfg.jar
|
||||||
|
*.sh
|
||||||
|
__pycache__
|
||||||
BIN
README.md
BIN
README.md
Binary file not shown.
|
|
@ -9,7 +9,8 @@ import re
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import pysftp
|
import pysftp
|
||||||
|
|
||||||
# TODO make sure these are not case sensitive
|
#NOTE for now, each of the arrays should be all lowercase
|
||||||
|
#TODO eventually make them case agnostic
|
||||||
# all of the columns we want to extract from the csv file
|
# all of the columns we want to extract from the csv file
|
||||||
# excluding the question ids (they are found using regex)
|
# excluding the question ids (they are found using regex)
|
||||||
final_columns_student = {
|
final_columns_student = {
|
||||||
|
|
@ -20,7 +21,7 @@ final_columns_student = {
|
||||||
'Progress': ['progress'],
|
'Progress': ['progress'],
|
||||||
'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
|
'Duration': ['duration', 'duration..in.seconds', 'duration (in seconds)'],
|
||||||
'District': ['district', 'please select your school district.'],
|
'District': ['district', 'please select your school district.'],
|
||||||
'LASID': ['lasid', 'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).'],
|
'LASID': ['lasid', 'please enter your locally assigned student id number (lasid, or student lunch number).'],
|
||||||
'Grade': ['grade', 'what grade are you in?'],
|
'Grade': ['grade', 'what grade are you in?'],
|
||||||
'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
|
'Gender': ['gender', 'what is your gender?', 'what is your gender? - selected choice'],
|
||||||
'Race': ['race'],
|
'Race': ['race'],
|
||||||
|
|
@ -42,6 +43,10 @@ final_columns_teacher = {
|
||||||
'Dese Id': ['deseid', 'dese id', 'school'],
|
'Dese Id': ['deseid', 'dese id', 'school'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
argVerbose = False
|
||||||
|
argQuiet = True
|
||||||
|
|
||||||
|
|
||||||
class Sftp:
|
class Sftp:
|
||||||
def __init__(self, hostname, username, password, cnopts, port=22):
|
def __init__(self, hostname, username, password, cnopts, port=22):
|
||||||
"""Constructor Method"""
|
"""Constructor Method"""
|
||||||
|
|
@ -68,12 +73,12 @@ class Sftp:
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise Exception(err)
|
raise Exception(err)
|
||||||
finally:
|
finally:
|
||||||
if not args.quiet: print(f"Connected to {self.hostname} as {self.username}.")
|
if not argQuiet: print(f"Connected to {self.hostname} as {self.username}.")
|
||||||
|
|
||||||
def disconnect(self):
|
def disconnect(self):
|
||||||
"""Closes the sftp connection"""
|
"""Closes the sftp connection"""
|
||||||
self.connection.close()
|
self.connection.close()
|
||||||
if not args.quiet: print(f"Disconnected from host {self.hostname}")
|
if not argQuiet: print(f"Disconnected from host {self.hostname}")
|
||||||
|
|
||||||
def listdir(self, remote_path):
|
def listdir(self, remote_path):
|
||||||
"""lists all the files and directories in the specified path and returns them"""
|
"""lists all the files and directories in the specified path and returns them"""
|
||||||
|
|
@ -92,7 +97,7 @@ class Sftp:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not args.quiet: print(
|
if not argQuiet: print(
|
||||||
f"downloading from {self.hostname} as {self.username} [(remote path : {remote_path});(local path: {target_local_path})]"
|
f"downloading from {self.hostname} as {self.username} [(remote path : {remote_path});(local path: {target_local_path})]"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -106,7 +111,7 @@ class Sftp:
|
||||||
|
|
||||||
# Download from remote sftp server to local
|
# Download from remote sftp server to local
|
||||||
self.connection.get(remote_path, target_local_path)
|
self.connection.get(remote_path, target_local_path)
|
||||||
if not args.quiet: print("download completed")
|
if not argQuiet: print("download completed")
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise Exception(err)
|
raise Exception(err)
|
||||||
|
|
@ -117,13 +122,13 @@ class Sftp:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not args.quiet: print(
|
if not argQuiet: print(
|
||||||
f"uploading to {self.hostname} as {self.username} [(remote path: {remote_path});(source local path: {source_local_path})]"
|
f"uploading to {self.hostname} as {self.username} [(remote path: {remote_path});(source local path: {source_local_path})]"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Upload file to SFTP
|
# Upload file to SFTP
|
||||||
self.connection.put(source_local_path, remote_path)
|
self.connection.put(source_local_path, remote_path)
|
||||||
if not args.quiet: print("upload completed")
|
if not argQuiet: print("upload completed")
|
||||||
|
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
raise Exception(err)
|
raise Exception(err)
|
||||||
|
|
@ -135,10 +140,10 @@ def prep_dir(folder=''):
|
||||||
cwd = os.path.join(os.getcwd(), folder)
|
cwd = os.path.join(os.getcwd(), folder)
|
||||||
mwd = os.path.join(cwd, 'merged')
|
mwd = os.path.join(cwd, 'merged')
|
||||||
if not os.path.exists(mwd):
|
if not os.path.exists(mwd):
|
||||||
if args.verbose: print(f'Creating directory {mwd}')
|
if argVerbose: print(f'Creating directory {mwd}')
|
||||||
os.mkdir(mwd)
|
os.mkdir(mwd)
|
||||||
if args.verbose: print('Source data directory: ' + cwd)
|
if argVerbose: print('Source data directory: ' + cwd)
|
||||||
if args.verbose: print('Merged data directory: ' + mwd)
|
if argVerbose: print('Merged data directory: ' + mwd)
|
||||||
return cwd, mwd
|
return cwd, mwd
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -162,13 +167,13 @@ def combine_cols(df, col, possibilities):
|
||||||
if cl == col:
|
if cl == col:
|
||||||
continue
|
continue
|
||||||
# replace the column...
|
# replace the column...
|
||||||
if args.verbose: print(f'Replacing column {cl}')
|
if argVerbose: print(f'Replacing column {cl}')
|
||||||
df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[cl])
|
df[col] = df[col].replace(r'^\s*$', np.nan, regex=True).fillna(df[cl])
|
||||||
# and add it to the drop list
|
# and add it to the drop list
|
||||||
drops.append(cl)
|
drops.append(cl)
|
||||||
# drop spent columns
|
# drop spent columns
|
||||||
df = df.drop(columns=drops)
|
df = df.drop(columns=drops)
|
||||||
if args.verbose: print(f'Dropped columns: {drops}')
|
if argVerbose: print(f'Dropped columns: {drops}')
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -182,7 +187,7 @@ def clean_cols_student(df):
|
||||||
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
||||||
drops.append(col)
|
drops.append(col)
|
||||||
df = df.drop(columns=drops)
|
df = df.drop(columns=drops)
|
||||||
if args.verbose: print(f'Dropped columns: {drops}')
|
if argVerbose: print(f'Dropped columns: {drops}')
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -196,20 +201,20 @@ def clean_cols_teacher(df):
|
||||||
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
||||||
drops.append(col)
|
drops.append(col)
|
||||||
df = df.drop(columns=drops)
|
df = df.drop(columns=drops)
|
||||||
if args.verbose: print(f'Dropped columns: {drops}')
|
if argVerbose: print(f'Dropped columns: {drops}')
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
# performs all merging operations for student data
|
# performs all merging operations for student data
|
||||||
def do_merge_student(cwd, mwd):
|
def do_merge_student(cwd, mwd):
|
||||||
# identify and merge student files
|
# identify and merge student files
|
||||||
if not args.quiet: print('---Merging Student Data---')
|
if not argQuiet: print('---Merging Student Data---')
|
||||||
all_files = glob.glob(os.path.join(cwd, "*student*.csv"))
|
all_files = glob.glob(os.path.join(cwd, "*student*.csv"))
|
||||||
if not args.quiet: print(f'Found {len(all_files)} Student CSV files')
|
if not argQuiet: print(f'Found {len(all_files)} Student CSV files')
|
||||||
if len(all_files) < 1:
|
if len(all_files) < 1:
|
||||||
if not args.quiet: print('No files found. Skipping merge...')
|
if not argQuiet: print('No files found. Skipping merge...')
|
||||||
return
|
return
|
||||||
if not args.quiet: print('Merging...')
|
if not argQuiet: print('Merging...')
|
||||||
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
||||||
# count lines in read csv files
|
# count lines in read csv files
|
||||||
lines = 0
|
lines = 0
|
||||||
|
|
@ -218,10 +223,10 @@ def do_merge_student(cwd, mwd):
|
||||||
# combine csv files
|
# combine csv files
|
||||||
df = pd.concat(files, axis=0)
|
df = pd.concat(files, axis=0)
|
||||||
# combine related columns
|
# combine related columns
|
||||||
if not args.quiet: print('Repairing rows...')
|
if not argQuiet: print('Repairing rows...')
|
||||||
df = repair_student_columns(df)
|
df = repair_student_columns(df)
|
||||||
# clean out unnecessary columns
|
# clean out unnecessary columns
|
||||||
if not args.quiet: print('Cleaning out columns...')
|
if not argQuiet: print('Cleaning out columns...')
|
||||||
df = clean_cols_student(df)
|
df = clean_cols_student(df)
|
||||||
# ensure line count matches what is expected
|
# ensure line count matches what is expected
|
||||||
if df.shape[0] != lines:
|
if df.shape[0] != lines:
|
||||||
|
|
@ -234,20 +239,20 @@ def do_merge_student(cwd, mwd):
|
||||||
proj = ''
|
proj = ''
|
||||||
fn = f'{date}{proj}-student-data-merged.csv'
|
fn = f'{date}{proj}-student-data-merged.csv'
|
||||||
df.to_csv(os.path.join(mwd, fn), index=False)
|
df.to_csv(os.path.join(mwd, fn), index=False)
|
||||||
if not args.quiet: print('Student data merged successfully!')
|
if not argQuiet: print('Student data merged successfully!')
|
||||||
return fn
|
return fn
|
||||||
|
|
||||||
|
|
||||||
# performs all merging operations for teacher data
|
# performs all merging operations for teacher data
|
||||||
def do_merge_teacher(cwd, mwd):
|
def do_merge_teacher(cwd, mwd):
|
||||||
# identify and merge teacher files
|
# identify and merge teacher files
|
||||||
if not args.quiet: print('---Merging Teacher Data---')
|
if not argQuiet: print('---Merging Teacher Data---')
|
||||||
all_files = glob.glob(os.path.join(cwd, "*teacher*.csv"))
|
all_files = glob.glob(os.path.join(cwd, "*teacher*.csv"))
|
||||||
if not args.quiet: print(f'Found {len(all_files)} Teacher CSV files')
|
if not argQuiet: print(f'Found {len(all_files)} Teacher CSV files')
|
||||||
if len(all_files) < 1:
|
if len(all_files) < 1:
|
||||||
if not args.quiet: print('No files found. Skipping merge...')
|
if not argQuiet: print('No files found. Skipping merge...')
|
||||||
return
|
return
|
||||||
if not args.quiet: print('Merging...')
|
if not argQuiet: print('Merging...')
|
||||||
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
||||||
# count lines in read csv files
|
# count lines in read csv files
|
||||||
lines = 0
|
lines = 0
|
||||||
|
|
@ -256,10 +261,10 @@ def do_merge_teacher(cwd, mwd):
|
||||||
# combine csv files
|
# combine csv files
|
||||||
df = pd.concat(files, axis=0)
|
df = pd.concat(files, axis=0)
|
||||||
# combine related columns
|
# combine related columns
|
||||||
if not args.quiet: print('Repairing columns...')
|
if not argQuiet: print('Repairing columns...')
|
||||||
df = repair_teacher_columns(df)
|
df = repair_teacher_columns(df)
|
||||||
# clean out unnecessary columns
|
# clean out unnecessary columns
|
||||||
if not args.quiet: print('Cleaning out columns...')
|
if not argQuiet: print('Cleaning out columns...')
|
||||||
df = clean_cols_teacher(df)
|
df = clean_cols_teacher(df)
|
||||||
# ensure line count matches what is expected
|
# ensure line count matches what is expected
|
||||||
if df.shape[0] != lines:
|
if df.shape[0] != lines:
|
||||||
|
|
@ -272,7 +277,7 @@ def do_merge_teacher(cwd, mwd):
|
||||||
proj = ''
|
proj = ''
|
||||||
fn = f'{date}{proj}-teacher-data-merged.csv'
|
fn = f'{date}{proj}-teacher-data-merged.csv'
|
||||||
df.to_csv(os.path.join(mwd, fn), index=False)
|
df.to_csv(os.path.join(mwd, fn), index=False)
|
||||||
if not args.quiet: print('Teacher data merged successfully!')
|
if not argQuiet: print('Teacher data merged successfully!')
|
||||||
return fn
|
return fn
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -288,7 +293,7 @@ def repair_teacher_columns(df):
|
||||||
def repair_student_columns(df):
|
def repair_student_columns(df):
|
||||||
for col in final_columns_student:
|
for col in final_columns_student:
|
||||||
df = combine_cols(df, col, final_columns_student[col])
|
df = combine_cols(df, col, final_columns_student[col])
|
||||||
if not args.quiet: print('Combining Question Variants...')
|
if not argQuiet: print('Combining Question Variants...')
|
||||||
df = combine_variants(df)
|
df = combine_variants(df)
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
@ -309,6 +314,7 @@ def combine_variants(df):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
# parse flags
|
# parse flags
|
||||||
|
|
@ -344,23 +350,26 @@ if __name__ == '__main__':
|
||||||
help='sftp url for remote merging')
|
help='sftp url for remote merging')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
argVerbose = args.verbose
|
||||||
|
argQuiet = args.quiet
|
||||||
|
|
||||||
#quiet takes precedence over verbose
|
#quiet takes precedence over verbose
|
||||||
if args.quiet:
|
if argQuiet:
|
||||||
args.verbose = False
|
argVerbose = False
|
||||||
|
|
||||||
# make sure -s or -t is set
|
# make sure -s or -t is set
|
||||||
if not (args.student or args.teacher):
|
if not (args.student or args.teacher):
|
||||||
if not args.quiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
|
if not argQuiet: print('Notice: Neither -s nor -t are specified. No merge will be performed.')
|
||||||
|
|
||||||
if args.directory and not args.remote_url:
|
if args.directory and not args.remote_url:
|
||||||
c, m = prep_dir(args.directory)
|
c, m = prep_dir(args.directory)
|
||||||
elif not args.directory:
|
elif not args.directory:
|
||||||
if not args.quiet: print('Notice: No directory specified. Defaulting to current directory.')
|
if not argQuiet: print('Notice: No directory specified. Defaulting to current directory.')
|
||||||
c, m = prep_dir()
|
c, m = prep_dir()
|
||||||
|
|
||||||
# prepare sftp if flagged
|
# prepare sftp if flagged
|
||||||
if args.remote_url:
|
if args.remote_url:
|
||||||
if not args.quiet: print(f'Remote destination set, fetching files...')
|
if not argQuiet: print(f'Remote destination set, fetching files...')
|
||||||
parsed_url = urlparse(args.remote_url)
|
parsed_url = urlparse(args.remote_url)
|
||||||
cnopts = pysftp.CnOpts()
|
cnopts = pysftp.CnOpts()
|
||||||
cnopts.hostkeys = None
|
cnopts.hostkeys = None
|
||||||
|
|
@ -386,7 +395,7 @@ if __name__ == '__main__':
|
||||||
for file in sftp.listdir_attr(path):
|
for file in sftp.listdir_attr(path):
|
||||||
if file.filename.endswith(".csv"):
|
if file.filename.endswith(".csv"):
|
||||||
filelist.append(file.filename)
|
filelist.append(file.filename)
|
||||||
if not args.quiet: print(f'Fetching file {file.filename}...')
|
if not argQuiet: print(f'Fetching file {file.filename}...')
|
||||||
sftp.download(path + file.filename, c + file.filename)
|
sftp.download(path + file.filename, c + file.filename)
|
||||||
|
|
||||||
# perform merges
|
# perform merges
|
||||||
|
|
@ -397,11 +406,11 @@ if __name__ == '__main__':
|
||||||
|
|
||||||
if args.remote_url:
|
if args.remote_url:
|
||||||
# upload tmd and smd to remote
|
# upload tmd and smd to remote
|
||||||
if not args.quiet: print('Uploading merged data...')
|
if not argQuiet: print('Uploading merged data...')
|
||||||
sftp.upload(m + '/' + tmd, path + 'merged/' + tmd)
|
sftp.upload(m + '/' + tmd, path + 'merged/' + tmd)
|
||||||
sftp.upload(m + '/' + smd, path + 'merged/' + smd)
|
sftp.upload(m + '/' + smd, path + 'merged/' + smd)
|
||||||
# remove merged directory
|
# remove merged directory
|
||||||
if not args.quiet: print('Cleaning up...')
|
if not argQuiet: print('Cleaning up...')
|
||||||
os.remove(m + '/' + tmd)
|
os.remove(m + '/' + tmd)
|
||||||
os.remove(m + '/' + smd)
|
os.remove(m + '/' + smd)
|
||||||
os.rmdir(m)
|
os.rmdir(m)
|
||||||
|
|
@ -410,4 +419,4 @@ if __name__ == '__main__':
|
||||||
if os.path.exists(f):
|
if os.path.exists(f):
|
||||||
os.remove(f)
|
os.remove(f)
|
||||||
sftp.disconnect()
|
sftp.disconnect()
|
||||||
if not args.quiet: print('Done!')
|
if not argQuiet: print('Done!')
|
||||||
223
test.py
Normal file
223
test.py
Normal file
|
|
@ -0,0 +1,223 @@
|
||||||
|
import merge
|
||||||
|
import unittest
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
# Small student fixture: one unrelated column plus three differently named
# gender columns whose non-empty cells never share a row.
tdata_student = pd.DataFrame(dict(zip(
    ('BadColumn', 'Gender', 'gender:', 'Gender - SIS'),
    (
        ['1', '2', '3', '4'],
        ['1', '', '', ''],
        ['', '', '2', '1'],
        ['', '2', '', ''],
    ),
)))
|
||||||
|
|
||||||
|
class TestMergeCSV(unittest.TestCase):
    """Checks the column-combining and column-cleaning helpers exposed by ``merge``."""

    # Shared cell patterns: answers present in rows 0-2 only, rows 3-4 only,
    # every row, and a filler value for columns that should be discarded.
    _HEAD = ['1', '2', '3', '', '']
    _TAIL = ['', '', '', '4', '5']
    _FULL = ['1', '2', '3', '4', '5']
    _JUNK = ['x', 'x', 'x', 'x', 'x']

    def test_combine_variants(self):
        # The '-1' variant columns carry the answers their base columns lack;
        # the result must contain only the filled-in base columns.
        source = pd.DataFrame({
            's-peff-q1': self._HEAD,
            's-peff-q10': self._HEAD,
            's-peff-q1-1': self._TAIL,
            's-peff-q10-1': self._TAIL,
        })
        wanted = pd.DataFrame({
            's-peff-q1': self._FULL,
            's-peff-q10': self._FULL,
        })
        unwanted = pd.DataFrame({
            's-peff-q1-1': self._FULL,
            's-peff-q10-1': self._FULL,
        })

        result = merge.combine_variants(source)

        self.assertTrue(result.equals(wanted))
        self.assertFalse(result.equals(unwanted))

    def test_clean_cols_student(self):
        # 'BadColumn', 'Gender - SIS' and the '.1' question duplicate must be
        # removed; the remaining columns must come back unchanged and in order.
        leading = (
            'Start Date', 'End Date', 'Status', 'Ip Address', 'Progress',
            'Duration', 'District', 'LASID', 'Grade', 'Race',
            'Recorded Date', 'Dese Id',
        )
        sis_gender = ['1', '2', '3', '', '']
        gender = ['1', '2', '1', '2', '2']
        source = pd.DataFrame({
            **dict.fromkeys(leading, self._FULL),
            'BadColumn': self._JUNK,
            'Gender - SIS': sis_gender,
            'Gender': gender,
            'Response Id': self._FULL,
            's-peff-q1': self._FULL,
            's-peff-q10': self._FULL,
            's-peff-q1.1': self._FULL,
        })
        wanted = pd.DataFrame({
            **dict.fromkeys(leading, self._FULL),
            'Gender': gender,
            'Response Id': self._FULL,
            's-peff-q1': self._FULL,
            's-peff-q10': self._FULL,
        })
        unwanted = pd.DataFrame({
            'BadColumn': self._JUNK,
            'Gender - SIS': sis_gender,
            's-peff-q1.1': self._FULL,
        })

        result = merge.clean_cols_student(source)

        self.assertTrue(result.equals(wanted), result)
        self.assertFalse(result.equals(unwanted), result)

    def test_clean_cols_teacher(self):
        # The four unrelated columns and the '.1' question duplicates must be
        # removed; the remaining columns must come back unchanged and in order.
        kept_head = (
            'Start Date', 'End Date', 'Status', 'Ip Address', 'Progress',
            'Duration', 'District', 'Recorded Date',
        )
        noise = ('BadColumn', 'Blah Blah Blah', 'Abbey Road', 'Please List Your Cats')
        kept_tail = ('Response Id', 'Dese Id', 't-peff-q1', 't-peff-q10')
        duplicates = ('t-peff-q1.1', 't-peff-q10.1')
        source = pd.DataFrame({
            **dict.fromkeys(kept_head, self._FULL),
            **dict.fromkeys(noise, self._JUNK),
            **dict.fromkeys(kept_tail, self._FULL),
            **dict.fromkeys(duplicates, self._FULL),
        })
        wanted = pd.DataFrame(dict.fromkeys(kept_head + kept_tail, self._FULL))
        unwanted = pd.DataFrame({
            **dict.fromkeys(noise, self._JUNK),
            **dict.fromkeys(duplicates, self._FULL),
        })

        result = merge.clean_cols_teacher(source)

        self.assertTrue(result.equals(wanted), result)
        self.assertFalse(result.equals(unwanted), result)

    def test_combine_cols(self):
        # Values scattered over the two alias columns must end up in
        # 'My Column'; the alias columns themselves must disappear.
        source = pd.DataFrame({
            'My Column': ['1', '', '', '', ''],
            'My Other Column': ['', '2', '3', '', ''],
            'Not My Column': self._FULL,
            'My Last Column': ['', '', '', '4', '5'],
        })
        wanted = pd.DataFrame({
            'My Column': self._FULL,
            'Not My Column': self._FULL,
        })

        result = merge.combine_cols(source, 'My Column', ['my other column', 'my last column'])

        self.assertTrue(result.equals(wanted), f'\n{result}')

    def test_repair_cols_student(self):
        # Every canonical column is split between itself and one alias; after
        # the repair only fully populated canonical columns may remain.
        source = pd.DataFrame({
            'Start Date': self._TAIL,
            'End Date': self._TAIL,
            'Ip Address': self._TAIL,
            'StartDate': self._HEAD,
            'EndDate': self._HEAD,
            'IpAddress': self._HEAD,
            'Status': self._FULL,
            'Progress': self._FULL,
            'Duration': self._FULL,
            'District': self._FULL,
            'Recorded Date': self._TAIL,
            'RecordedDate': self._HEAD,
            'Response Id': self._FULL,
            'Dese Id': self._TAIL,
            'School': self._HEAD,
            'LASID': self._HEAD,
            'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).': self._TAIL,
            'Grade': self._HEAD,
            'What grade are you in?': self._TAIL,
            'Gender': self._HEAD,
            'What is your gender?': self._TAIL,
            'Race': self._FULL,
        })
        wanted = pd.DataFrame(dict.fromkeys(
            (
                'Start Date', 'End Date', 'Ip Address', 'Status', 'Progress',
                'Duration', 'District', 'Recorded Date', 'Response Id',
                'Dese Id', 'LASID', 'Grade', 'Gender', 'Race',
            ),
            self._FULL,
        ))

        result = merge.repair_student_columns(source)

        self.assertTrue(result.equals(wanted), f'\n{result}')

    def test_repair_cols_teacher(self):
        # Same split-column setup as the student case, limited to the columns
        # present in the teacher fixture.
        source = pd.DataFrame({
            'Start Date': self._TAIL,
            'End Date': self._TAIL,
            'Ip Address': self._TAIL,
            'StartDate': self._HEAD,
            'EndDate': self._HEAD,
            'IpAddress': self._HEAD,
            'Status': self._FULL,
            'Progress': self._FULL,
            'Duration': self._FULL,
            'District': self._FULL,
            'Recorded Date': self._TAIL,
            'RecordedDate': self._HEAD,
            'Response Id': self._FULL,
            'Dese Id': self._TAIL,
            'School': self._HEAD,
        })
        wanted = pd.DataFrame(dict.fromkeys(
            (
                'Start Date', 'End Date', 'Ip Address', 'Status', 'Progress',
                'Duration', 'District', 'Recorded Date', 'Response Id',
                'Dese Id',
            ),
            self._FULL,
        ))

        result = merge.repair_teacher_columns(source)

        self.assertTrue(result.equals(wanted), result)
|
||||||
|
|
||||||
|
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue