I'm trying to work with an Excel sheet that has fewer than 50k rows. What I'm trying to do is: using a particular column, get all the unique values, then for each unique value collect every row that contains it, and put the result in this format:
[{
    "unique_field_value": [array of the matching rows, each row as a dictionary]
},]
The thing is, when I test with fewer rows (say 1000), all goes well. As the row count grows, memory usage also increases until it can't grow any more and my PC freezes. So, is there something I'm not doing right with pandas? Here are the details of my platform:
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.3 LTS"
NAME="Ubuntu"
VERSION="16.04.3 LTS (Xenial Xerus)"
ID_LIKE=debian
VERSION_ID="16.04"
Here is the code I am running in a Jupyter Notebook:
import pandas as pd
import simplejson
import datetime

def datetime_handler(x):
    if isinstance(x, datetime.datetime):
        return x.isoformat()
    raise TypeError("Type not Known")

path = "/home/misachi/Downloads/new members/my_file.xls"
df = pd.read_excel(path, index_col=None, skiprows=[0])
df = df.dropna(thresh=5)  # keep only rows with at least 5 non-NA values
df2 = df.drop_duplicates(subset=['corporate'])
schemes = df2['corporate'].values
result_list = []
result_dict = {}
for count, name in enumerate(schemes):
    inner_dict = {}
    col_val = schemes[count]
    foo = df['corporate'] == col_val  # boolean mask: full scan per unique value
    data = df[foo].to_json(orient='records', date_format='iso')
    result_dict[name] = simplejson.loads(data)
    result_list.append(result_dict)
    # print(result_list)
    # if count == 3:
    #     break
dumped = simplejson.dumps(result_list, ignore_nan=True, default=datetime_handler)
with open('/home/misachi/Downloads/new members/members/folder/insurance.json', 'w') as json_f:
    json_f.write(dumped)
EDIT
Here is the expected sample output:
[{
"TABBY MEMORIAL CATHEDRAL": [{
"corp_id": 8494,
"smart": null,
"copay": null,
"corporate": "TABBY MEMORIAL CATHEDRAL",
"category": "CAT A",
"member_names": "Brian Maombi",
"member_no": "84984",
"start_date": "2017-03-01T00:00:00.000Z",
"end_date": "2018-02-28T00:00:00.000Z",
"outpatient": "OUTPATIENT"
}, {
"corp_id": 8494,
"smart": null,
"copay": null,
"corporate": "TABBY MEMORIAL CATHEDRAL",
"category": "CAT A",
"member_names": "Omula Peter",
"member_no": "4784984",
"start_date": "2017-03-01T00:00:00.000Z",
"end_date": "2018-02-28T00:00:00.000Z",
"outpatient": "OUTPATIENT"
}],
"CHECKIFY KENYA LTD": [{
"corp_id": 7489,
"smart": "SMART",
"copay": null,
"corporate": "CHECKIFY KENYA LTD",
"category": "CAT A",
"member_names": "BENARD KONYI",
"member_no": "ABB/8439",
"start_date": "2017-08-01T00:00:00.000Z",
"end_date": "2018-07-31T00:00:00.000Z",
"outpatient": "OUTPATIENT"
}, {
"corp_id": 7489,
"smart": "SMART",
"copay": null,
"corporate": "CHECKIFY KENYA LTD",
"category": "CAT A",
"member_names": "KEVIN WACHAI",
"member_no": "ABB/67484",
"start_date": "2017-08-01T00:00:00.000Z",
"end_date": "2018-07-31T00:00:00.000Z",
"outpatient": "OUTPATIENT"
}]
}]
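As far as I can tell, the main problem was in the loop: result_dict is created once, outside the loop, so result_list ends up holding one reference to the same dict per unique value, and simplejson.dumps then writes every group's rows once per unique value, meaning output size and memory grow quadratically with the number of corporates. On top of that, the mask df['corporate'] == col_val re-scans the whole frame on every iteration. Here is a minimal, pandas-free sketch of the duplication (made-up values, just to illustrate the effect):

import json

result_list = []
result_dict = {}
for name in ['A', 'B', 'C']:
    result_dict[name] = [{'row': name}]
    result_list.append(result_dict)  # appends a reference to the SAME dict each time

print(len(result_list))  # 3, but all three entries point to one object
print(json.dumps(result_list))
# [{"A": [...], "B": [...], "C": [...]}, {"A": [...], "B": [...], "C": [...]}, ...]
# every group gets serialized once per unique value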
The complete, cleaner code is:
import os
import pandas as pd
import simplejson
import datetime

def datetime_handler(x):
    if isinstance(x, datetime.datetime):
        return x.isoformat()
    raise TypeError("Unknown type")

def work_on_data(filename):
    if not os.path.isfile(filename):
        raise IOError(filename)
    df = pd.read_excel(filename, index_col=None, skiprows=[0])
    df = df.dropna(thresh=5)
    # group once by 'corporate'; each group becomes {name: [row dicts]}
    result_list = [{n: g.to_dict('records')} for n, g in df.groupby('corporate')]
    dumped = simplejson.dumps(result_list, ignore_nan=True, default=datetime_handler)
    return dumped

dumped = work_on_data('/home/misachi/Downloads/new members/my_file.xls')
with open('/home/misachi/Downloads/new members/members/folder/insurance.json', 'w') as json_f:
    json_f.write(dumped)
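For reference, here is what the groupby one-liner produces on a tiny made-up frame (the column values below are hypothetical, just to show the shape of the result):

import pandas as pd

df = pd.DataFrame({
    'corporate': ['ACME LTD', 'ACME LTD', 'GLOBEX CORP'],
    'member_no': ['100', '101', '200'],
})

result_list = [{n: g.to_dict('records')} for n, g in df.groupby('corporate')]
# [{'ACME LTD': [{'corporate': 'ACME LTD', 'member_no': '100'},
#                {'corporate': 'ACME LTD', 'member_no': '101'}]},
#  {'GLOBEX CORP': [{'corporate': 'GLOBEX CORP', 'member_no': '200'}]}]

Each row is visited and serialized exactly once, so memory stays proportional to the data instead of growing with the number of unique values.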