The default dict in My following code returns non-default value.
from collections import defaultdict
from typing import List, AnyStr
import unittest
import pandas as pd
import enum
import logging
from copy import copy
from pprint import pprint
logging.basicConfig(level=logging.NOTSET)
class CategoryEncoder:
"""Given a list of set of categories, build up indexes corresponding to
each category. Each category in each column is mapped to a unique value
(here called indexes). """
def __init__(self, all_categories:List[List[AnyStr]]):
"""
Args:
all_categories (List[List[Str]]): List of set of categories. eg.
[
['A', 'B', 'C'],
['D', 'E'],
['F', 'G', 'H', 'I', 'J']
]
The sets need not be of the same size
"""
self.all_categories = all_categories
# a list containing category's mapping to an index.
# self.index[i] represents the mapping corresponding to i'th column in dataset(csv)
self.index, self.offset, offset = [], 0, 0
for column_index, categories in enumerate(all_categories):
categories = set(categories)
logging.debug(f'{column_index}, {offset}')
self.index.append(defaultdict(lambda: offset))
for index, category in enumerate(categories):
self.index[column_index][category] = index + 1 + offset
offset += len(categories) + 1
self.offset = offset
def get_index(self, column_index, word):
return self.index[column_index][word]
def __len__(self):
return self.offset
if __name__ == '__main__':
all_categories = [
['A', 'B', 'C'],
['D', 'E'],
['F', 'G', 'H', 'I', 'J']
]
encoder = CategoryEncoder(all_categories)
print('#', encoder.index[0]['#'])
pprint(encoder.index)
with output:
DEBUG:root:0, 0
DEBUG:root:1, 4
DEBUG:root:2, 7
# 13
[defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbef2dbe280>,
{'#': 13,
'A': 3,
'B': 2,
'C': 1}),
defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbedbee6ee0>,
{'D': 5,
'E': 6}),
defaultdict(<function CategoryEncoder.__init__.<locals>.<lambda> at 0x7fbedbee6f70>,
{'F': 11,
'G': 10,
'H': 12,
'I': 9,
'J': 8})]
Some details about the code (to make it easy to understand):
- The code takes strings grouped in lists, and assigns a unique id to each of them. For each group, it saves a unique id that is to be assigned to any other string that were not present. eg. for group ['A', 'B', 'C'], the id 0 is saved and is the default value of the defaultdict corresponding to that key. (see
logging.debug(f'{column_index}, {offset}')
line) - self.index is a list that is initially empty, but at the end contains default dicts corresponding to each list in the argument 'all_categories'. It is printed at the end. See the output for better understanding.
The problem:
The logging.debug statement shows that the defaultdict stored at index 0 of the list self.index, has a default value 0, but when tried to access it print('#', encoder.index[0]['#'])
, it returns 13.
I am not sure why so.