0

Example of dictionary:

    # Raw input records keyed by person id; every field value is a string.
    # Each string literal is kept on a single line: wrapping inside the
    # quotes (as in 'St. Kilda') is a syntax error.
    data_noisy = {
        'P1': {'age': 'eighty two', 'salary': '60196.0',
               'suburb': 'Toorak', 'language': 'English'},
        'P2': {'age': '49', 'salary': '-16945514.0',
               'suburb': 'St. Kilda', 'language': 'Chinese'},
        'P3': {'age': '54', 'salary': '49775.0',
               'suburb': 'Neverland', 'language': 'Italian'},
    }

Wanted output:

    # Expected result: each invalid field is replaced by the string 'None'.
    # Each string literal is kept on a single line: wrapping inside the
    # quotes (as in 'St. Kilda') is a syntax error.
    data_clean = {
        'P1': {'age': 'None', 'salary': '60196.0',
               'suburb': 'Toorak', 'language': 'English'},
        'P2': {'age': '49', 'salary': 'None',
               'suburb': 'St. Kilda', 'language': 'Chinese'},
        'P3': {'age': '54', 'salary': '49775.0',
               'suburb': 'None', 'language': 'Italian'},
    }


    # Largest salary accepted as valid; clean_data rejects anything above it.
    MAX_SALARY = 200000

    # Suburb names accepted as valid; clean_data rejects any other suburb.
    VALID_SUBURBS = [
        "Richmond",
        "Southbank",
        "Fitzroy",
        "Docklands",
        "St. Kilda",
        "Footscray",
        "Hawthorn",
        "Parkville",
        "Toorak",
        "Brunswick",
        "Kensington",
        "Flemington",
        "Frankston",
        "Dandenong",
        "Caulfield",
        "Collingwood",
    ]

def clean_data(data, *, max_salary=None, valid_suburbs=None):
    """Return a cleaned copy of *data*, leaving the original untouched.

    Each value of *data* is a record dict holding at least the keys
    ``'age'``, ``'salary'`` and ``'suburb'``.  A field that fails validation
    is replaced by the string ``'None'``:

    * ``'age'`` must consist solely of digits;
    * ``'salary'`` must parse as a number between 0 and *max_salary*
      (inclusive);
    * ``'suburb'`` must be a member of *valid_suburbs*.

    Every other field is copied through unchanged.

    Args:
        data: Mapping of record id to record dict.
        max_salary: Largest acceptable salary.  Defaults to the module-level
            ``MAX_SALARY``.
        valid_suburbs: Collection of acceptable suburb names.  Defaults to
            the module-level ``VALID_SUBURBS``.

    Returns:
        A new dict with the same keys as *data*, mapping to cleaned copies
        of the records.  The result is also printed.

    Raises:
        KeyError: If a record lacks ``'age'``, ``'salary'`` or ``'suburb'``.
    """
    if max_salary is None:
        max_salary = MAX_SALARY
    if valid_suburbs is None:
        valid_suburbs = VALID_SUBURBS

    cleaned = {}
    for key, record in data.items():
        # Copy each inner dict.  Plain assignment (`cleaned = data`) only
        # creates a second name for the same object, and adding top-level
        # keys to a dict while iterating over it raises
        # "RuntimeError: dictionary changed size during iteration".
        entry = dict(record)

        if not entry['age'].isdigit():
            entry['age'] = 'None'

        try:
            salary = float(entry['salary'])
        except (TypeError, ValueError):
            # Unparseable salaries are invalid data, not a reason to crash.
            salary = None
        # The chained comparison is False for NaN, so NaN is rejected too.
        if salary is None or not 0 <= salary <= max_salary:
            entry['salary'] = 'None'

        if entry['suburb'] not in valid_suburbs:
            entry['suburb'] = 'None'

        cleaned[key] = entry

    print(cleaned)
    return cleaned

I don't want to change the original dictionary, so I tried to copy it and then iterate over the copy to "clean" the data. However, I just get `RuntimeError: dictionary changed size during iteration`.

Any assistance with syntax etc. with working with these nested dictionaries would be greatly appreciated.

Thanks.

Barmar
  • 741,623
  • 53
  • 500
  • 612

1 Answer

1

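Since you don't want to modify the original dictionary and instead intend to modify a copy, you will need `deepcopy`. A plain assignment such as `data_dict = data` only creates a second name for the same object, and a shallow copy would still share the inner dictionaries, so changes to them would show up in the original as well.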

from copy import deepcopy

# Clean an independent deep copy so that data_noisy keeps its original values.
data_clean = deepcopy(data_noisy)

for person in data_clean.values():
    # Ages must be written purely with digits.
    age_is_numeric = person['age'].isdigit()
    if not age_is_numeric:
        person['age'] = 'None'

    # Salaries must lie within [0, MAX_SALARY].
    salary = float(person['salary'])
    if salary < 0 or MAX_SALARY < salary:
        person['salary'] = 'None'

    # Suburbs must appear in the whitelist.
    if person['suburb'] not in VALID_SUBURBS:
        person['suburb'] = 'None'

# Show both: the original is unchanged, only the copy was cleaned.
print(data_noisy)
print(data_clean)
{'P1': {'age': 'eighty two', 'salary': '60196.0', 'suburb': 'Toorak', 'language': 'English'}, 'P2': {'age': '49', 'salary': '-16945514.0', 'suburb': 'St. Kilda', 'language': 'Chinese'}, 'P3': {'age': '54', 'salary': '49775.0', 'suburb': 'Neverland', 'language': 'Italian'}}
{'P1': {'age': 'None', 'salary': '60196.0', 'suburb': 'Toorak', 'language': 'English'}, 'P2': {'age': '49', 'salary': 'None', 'suburb': 'St. Kilda', 'language': 'Chinese'}, 'P3': {'age': '54', 'salary': '49775.0', 'suburb': 'None', 'language': 'Italian'}}
vash_the_stampede
  • 4,590
  • 1
  • 8
  • 20