mirror of
https://github.com/edcommonwealth/merge-csv.git
synced 2026-03-16 02:35:57 -07:00
fix columns, remove s--.1
This commit is contained in:
parent
4d025ba681
commit
2622be1d51
1 changed file with 17 additions and 5 deletions
22
merge-csv.py
22
merge-csv.py
|
|
@ -147,24 +147,36 @@ def clean_cols(df):
|
||||||
keep = [
|
keep = [
|
||||||
'StartDate',
|
'StartDate',
|
||||||
'EndDate',
|
'EndDate',
|
||||||
|
'Start Date',
|
||||||
|
'End Date',
|
||||||
'Status',
|
'Status',
|
||||||
'IPAddress',
|
'Response Type',
|
||||||
|
'IpAddress',
|
||||||
|
'Ip Address'
|
||||||
'Progress',
|
'Progress',
|
||||||
|
'Duration',
|
||||||
|
'Please enter your Locally Assigned Student ID Number (LASID, or student lunch number).',
|
||||||
'Finished',
|
'Finished',
|
||||||
'District',
|
'District',
|
||||||
'LASID',
|
'LASID',
|
||||||
|
'Recorded Date',
|
||||||
|
'RecordedDate',
|
||||||
'Grade',
|
'Grade',
|
||||||
'Gender',
|
'Gender',
|
||||||
'Race',
|
'Race',
|
||||||
'Response ID',
|
|
||||||
'Response Id',
|
'Response Id',
|
||||||
|
'ResponseId',
|
||||||
'DeseId',
|
'DeseId',
|
||||||
'Dese Id',
|
'Dese Id',
|
||||||
|
'School',
|
||||||
|
'District',
|
||||||
|
'Please select your school district.',
|
||||||
]
|
]
|
||||||
keep = list(map(str.lower, keep))
|
keep = list(map(str.lower, keep))
|
||||||
drops = []
|
drops = []
|
||||||
|
question_pattern = re.compile("^[s,t]-[a-zA-Z]{4}-q[0-9][0-9]?$")
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
if col.lower() not in keep and not col.startswith('s-') and not col.startswith('t-'):
|
if col.lower() not in keep and not bool(question_pattern.match(col)):
|
||||||
drops.append(col)
|
drops.append(col)
|
||||||
df = df.drop(columns=drops)
|
df = df.drop(columns=drops)
|
||||||
if args.verbose: print(f'Dropped columns: {drops}')
|
if args.verbose: print(f'Dropped columns: {drops}')
|
||||||
|
|
@ -183,8 +195,8 @@ def do_merge_student(cwd, mwd):
|
||||||
if not args.quiet: print('Merging...')
|
if not args.quiet: print('Merging...')
|
||||||
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
files = [pd.read_csv(f, low_memory=False) for f in all_files]
|
||||||
lines = 0
|
lines = 0
|
||||||
for f in files:
|
for fi in files:
|
||||||
lines += f.shape[0]
|
lines += fi.shape[0]
|
||||||
df = pd.concat(files, axis=0)
|
df = pd.concat(files, axis=0)
|
||||||
if not args.quiet: print('Repairing rows...')
|
if not args.quiet: print('Repairing rows...')
|
||||||
df = repair_student_rows(df)
|
df = repair_student_rows(df)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue