I understood that the `__init__` method would only be called when an object is instantiated. However, some tests with Pandas DataFrames have produced some curious and surprising results that are stopping me from achieving what I want to achieve. Any help in understanding what is going on would be greatly appreciated.
The below:
from pandas import DataFrame, Series
class ASeries(Series):
    """One-dimensional companion class for ``AFrame``.

    The two properties below name the classes to use for results derived
    from an ``ASeries``.
    """

    @property
    def _constructor_expanddim(self):
        # Class to use when a result gains a dimension (series -> frame).
        return AFrame

    @property
    def _constructor(self):
        # Class to use when a result stays one-dimensional.
        return ASeries
class AFrame(DataFrame):
    """DataFrame subclass that counts and reports every ``__init__`` call.

    NOTE: pandas may build new frames through ``_constructor`` while it
    works on an existing one, so ``__init__`` can run more often than the
    class is instantiated explicitly.
    """

    # Number of completed ``__init__`` calls. The counter is bumped through
    # ``self.__class__``, so for an instance of a subclass the inherited
    # value is read and the result is stored on that subclass; each
    # concrete class therefore ends up with its own count.
    a_count = 0
    thing = 'a'

    @property
    def _constructor(self):
        # Class to use for results that are still two-dimensional.
        return AFrame

    @property
    def _constructor_sliced(self):
        # Class to use for one-dimensional slices (rows / columns).
        return ASeries

    def __init__(self, data=None, *args, **kwargs):
        """Initialise the frame, then bump and print the call counter.

        Args:
            data: Forwarded to ``DataFrame.__init__``. Defaults to ``None``
                (as in ``DataFrame`` itself) so the class can also be
                constructed without data, e.g. ``AFrame()`` or
                ``AFrame(index=..., columns=...)``.
            *args: Extra positional arguments for ``DataFrame.__init__``.
            **kwargs: Extra keyword arguments for ``DataFrame.__init__``.
        """
        super().__init__(data, *args, **kwargs)
        self.__class__.a_count += 1
        print('a', self.__class__.a_count)
class BSeries(Series):
    """One-dimensional companion class for ``BFrame``.

    The two properties below name the classes to use for results derived
    from a ``BSeries``.
    """

    @property
    def _constructor_expanddim(self):
        # Class to use when a result gains a dimension (series -> frame).
        return BFrame

    @property
    def _constructor(self):
        # Class to use when a result stays one-dimensional.
        return BSeries
class BFrame(AFrame):
    """``AFrame`` subclass with its own ``__init__`` call counter.

    NOTE: pandas may build new frames through ``_constructor`` while it
    works on an existing one, so ``__init__`` can run more often than the
    class is instantiated explicitly.
    """

    # Number of completed ``BFrame.__init__`` calls, bumped through
    # ``self.__class__`` (i.e. stored on the concrete class of the instance).
    b_count = 0
    thing = 'b'

    @property
    def _constructor(self):
        # Class to use for results that are still two-dimensional.
        return BFrame

    @property
    def _constructor_sliced(self):
        # Class to use for one-dimensional slices (rows / columns).
        return BSeries

    def __init__(self, data=None, *args, **kwargs):
        """Run the parent initialiser, then bump and print ``b_count``.

        Args:
            data: Forwarded to ``AFrame.__init__``. Defaults to ``None`` so
                the class can also be constructed without data, e.g.
                ``BFrame()`` or ``BFrame(index=..., columns=...)``.
            *args: Extra positional arguments for the parent initialiser.
            **kwargs: Extra keyword arguments for the parent initialiser.
        """
        # The parent initialiser prints its own 'a' counter line first.
        super().__init__(data, *args, **kwargs)
        self.__class__.b_count += 1
        print('b', self.__class__.b_count)
# Build a two-row BFrame from a list of row dicts (short column names),
# then print the frame together with its type.
b = BFrame([
    {'a': 'a', 'b': 'Something', 'c': 1, 'd': 'aa', 'e': 'Somethings'},
    {'a': 'a', 'b': 'Something else', 'c': 1, 'd': 'aa', 'e': 'Somethings'},
])
print(b, type(b))
Gives this output, showing that, as I expected, each `__init__` method is only being run once:
a 1
b 1
a b c d e
0 a Something 1 aa Somethings
1 a Something else 1 aa Somethings <class '__main__.BFrame'>
Process finished with exit code 0
However, if I then make some of the column names longer:
# Same two rows as before, but four of the five column names are now
# 16 characters long; print the frame together with its type.
b = BFrame([
    {
        'aaaaaaaaaaaaaaaa': 'a',
        'bbbbbbbbbbbbbbbb': 'Something',
        'cccccccccccccccc': 1,
        'dddddddddddddddd': 'aa',
        'e': 'Somethings',
    },
    {
        'aaaaaaaaaaaaaaaa': 'a',
        'bbbbbbbbbbbbbbbb': 'Something else',
        'cccccccccccccccc': 1,
        'dddddddddddddddd': 'aa',
        'e': 'Somethings',
    },
])
print(b, type(b))
I get the output below, showing that the `__init__` of the first subclass is, apparently, being run recursively:
a 1
b 1
a 1
a 2
a 3
aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbb ... dddddddddddddddd e
0 a Something ... aa Somethings
1 a Something else ... aa Somethings
[2 rows x 5 columns] <class '__main__.BFrame'>
Process finished with exit code 0
If I remove the final `print(b, type(b))` call, though, I only get:
a 1
b 1
Process finished with exit code 0
If I then use the same data with the first subclass, which directly subclasses `DataFrame`, the `__init__` method now seems to run four times, even though it's the same data as above:
# Build an AFrame from two row dicts with long column names, then print
# the frame together with its type.
a = AFrame([
    {
        'aaaaaaaaaaaaaaaa': 'a',
        'bbbbbbbbbbbbbbbb': 'Something',
        'cccccccccccccccc': 1,
        'dddddddddddddddd': 'aa',
        'e': 'Somethings',
    },
    {
        'aaaaaaaaaaaaaaaa': 'a',
        'bbbbbbbbbbbbbbbb': 'Something else',
        'cccccccccccccccc': 1,
        'dddddddddddddddd': 'aa',
        'e': 'Somethings',
    },
])
print(a, type(a))
Giving:
a 1
a 2
a 3
a 4
aaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbb ... dddddddddddddddd e
0 a Something ... aa Somethings
1 a Something else ... aa Somethings
[2 rows x 5 columns] <class '__main__.AFrame'>
Process finished with exit code 0
However, if I remove the final `print(a, type(a))` call, I get:
a 1
Process finished with exit code 0
Is anyone able to diagnose and explain what is happening here, and how I can avoid it? Extensions of this code are currently impossible to run, but I'm hoping that understanding and fixing this behaviour will be the solution.