EDIT:
There is a bug in pandas 1.0.5, after upgrading to 1.1.1, there is no error.
I have a pandas dataframe that looks like that:
Name Date Price Label Y Z
foo1 1/1/20 100 1 _ _
foo1 1/1/20 200 2 _ _
. . . . . .
. . . . . .
foo1 1/8/20 240 1 _ _
foo2 1/2/20 500 1 _ _
. . . . . .
. . . . . .
foo2 1/7/20 423 4 _ _
. . . . . .
. . . . . .
- There are 80 unique values for the
Name
column, i.e. foo1 - foo80 - There are 20 unique
Date
values - There are 4 unique
Label
values - Y and Z columns are irrelevant for the new dataframe
I want to create a table s.t it will has 80 rows(each for each Name), and 20*4 + 1 columns (20x4 for each Date-Label combination, and 1 for the Name).
The final dataframe should look as follows:
**Name 1/1/20(Label1) 1/1/20(Label2) 1/1/20(Label3) 1/1/20(Label4) 1/2/20(Label1) ... 4/7/20(Label4)**
foo1 100 200 300 -1 -1 -1
foo2 -1 -1 -1 -1 500 -1
...............
...............
-1 indicates that there is not entry in the original entry for the specific Name-Date-Label combination.
I'm basically new to pandas, and I can certainly build the dataframe iteratively by hand(if..else solution) but I believe that there is a faster, readable, and easier solution.
> df.columns
Index(['A', 'B', 'Date', 'C',
'D', 'Price', 'Label', 'E',
'Name', 'F', 'G', 'H', 'I',
'J'],
dtype='object')
> df.head(10).to_dict('list')
{'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
'B': ['8/27/2015 0:00',
'10/15/2015 0:00',
'10/15/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'10/28/2015 0:00',
'9/29/2015 0:00',
'9/9/2015 0:00',
'11/9/2015 0:00'],
'Date': ['8/28/2015 0:00',
'10/16/2015 0:00',
'10/16/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'10/29/2015 0:00',
'9/30/2015 0:00',
'9/10/2015 0:00',
'11/10/2015 0:00'],
'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1',
'foo2',
'foo2',
'foo3',
'foo3',
'foo3',
'foo3',
'foo4',
'foo4',
'foo3'],
'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
'H': ['Friday',
'Friday',
'Friday',
'Thursday',
'Thursday',
'Thursday',
'Thursday',
'Wednesday',
'Thursday',
'Tuesday'],
'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
'J': [16.758457907159716,
7.671232876712329,
11.506849315068493,
17.869415807560138,
13.745704467353955,
12.096219931271474,
10.996563573883162,
9.397303727200637,
7.220216606498194,
5.6657223796034]}
Using
import pandas_datareader.data as web
import pandas as pd
import numpy as np
from datetime import datetime
df = pd.DataFrame({
# 'A': [160, 457, 457, 482, 482, 482, 482, 423, 223, 506],
# 'B': ['8/27/2015 0:00','10/15/2015 0:00','10/15/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','10/28/2015 0:00','9/29/2015 0:00','9/9/2015 0:00','11/9/2015 0:00'],
'Date': ['8/28/2015 0:00','10/16/2015 0:00','10/16/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','10/29/2015 0:00','9/30/2015 0:00','9/10/2015 0:00','11/10/2015 0:00'],
# 'C': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
# 'D': [1271, 1825, 1825, 1455, 1455, 1455, 1455, 2522, 1385, 1765],
'Price': [1058, 1685, 1615, 1195, 1255, 1279, 1295, 2285, 1285, 1665],
'Label': [3, 3, 2, 1, 3, 4, 2, 2, 1, 4],
# 'E': [13, 127, 127, -1, -1, -1, -1, -1, -1, -1],
'Name': ['foo1','foo2','foo2','foo3','foo3','foo3','foo3','foo4','foo4','foo3'],
# 'F': [4, 4, 4, 3, 3, 3, 3, 3, 3, 3],
# 'G': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
# 'H': ['Friday','Friday','Friday','Thursday','Thursday','Thursday','Thursday','Wednesday','Thursday','Tuesday'],
# 'I': [213, 140, 210, 260, 200, 176, 160, 237, 100, 100],
# 'J': [16.758457907159716,7.671232876712329,11.506849315068493,17.869415807560138,13.745704467353955,12.096219931271474,10.996563573883162,9.397303727200637,7.220216606498194,5.6657223796034]
})
df.pivot(index='Name', columns=['Date', 'Label'], values='Price')`
I get
ValueError Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1294 try:
-> 1295 level = self.names.index(level)
1296 except ValueError:
ValueError: 'Date' is not in list
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-17-542e5c02777d> in <module>
----> 1 df.pivot(index='Name', columns=['Date', 'Label'], values='Price')
~\anaconda3\lib\site-packages\pandas\core\frame.py in pivot(self, index, columns, values)
5921 from pandas.core.reshape.pivot import pivot
5922
-> 5923 return pivot(self, index=index, columns=columns, values=values)
5924
5925 _shared_docs[
~\anaconda3\lib\site-packages\pandas\core\reshape\pivot.py in pivot(data, index, columns, values)
448 else:
449 indexed = data._constructor_sliced(data[values].values, index=index)
--> 450 return indexed.unstack(columns)
451
452
~\anaconda3\lib\site-packages\pandas\core\series.py in unstack(self, level, fill_value)
3548 from pandas.core.reshape.reshape import unstack
3549
-> 3550 return unstack(self, level, fill_value)
3551
3552 # ----------------------------------------------------------------------
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in unstack(obj, level, fill_value)
396 # _unstack_multiple only handles MultiIndexes,
397 # and isn't needed for a single level
--> 398 return _unstack_multiple(obj, level, fill_value=fill_value)
399 else:
400 level = level[0]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in _unstack_multiple(data, clocs, fill_value)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\reshape\reshape.py in <listcomp>(.0)
318 index = data.index
319
--> 320 clocs = [index._get_level_number(i) for i in clocs]
321
322 rlocs = [i for i in range(index.nlevels) if i not in clocs]
~\anaconda3\lib\site-packages\pandas\core\indexes\multi.py in _get_level_number(self, level)
1296 except ValueError:
1297 if not is_integer(level):
-> 1298 raise KeyError(f"Level {level} not found")
1299 elif level < 0:
1300 level += self.nlevels
KeyError: 'Level Date not found'
Thanks!