Editing the question, to provide more details in the text form.
So I have DF :
print (a)
Year City occurrence
0 1905 New York 1
1 1921 New York 1
2 1922 New York 1
3 1923 New York 1
4 1927 New York 1
5 1927 New York 1
6 1928 New York 1
7 1928 New York 1
8 1932 New York 1
9 1933 New York 1
10 1933 New York 1
11 1934 New York 1
12 1936 New York 1
13 1937 New York 1
14 1938 New York 1
15 1938 New York 1
16 1939 New York 1
17 1940 New York 1
18 1941 New York 1
19 1943 New York 1
20 1947 New York 1
21 1949 New York 1
22 1950 New York 1
23 1951 New York 1
24 1952 New York 1
25 1953 New York 1
26 1954 New York 1
27 1955 New York 1
28 1956 New York 1
29 1956 New York 1
30 1958 New York 1
31 1961 New York 1
32 1962 New York 1
33 1968 New York 1
34 1969 New York 1
35 1970 New York 1
36 1973 New York 1
37 1977 New York 1
38 1978 New York 1
39 1980 New York 1
40 1981 New York 1
41 1982 New York 1
42 1983 New York 1
43 1986 New York 1
44 1986 New York 1
45 1990 New York 1
46 1994 New York 1
47 1995 New York 1
48 1996 New York 1
49 1998 New York 1
50 1999 New York 1
51 2000 New York 1
52 2000 New York 1
53 2003 New York 1
54 2007 New York 1
55 2009 New York 1
56 2011 New York 1
57 2021 New York 1
I want to do simple group by & count, and add new column with count. With SQL I would just do simple
select Year, City, count(*) from t1 group by 1,2
But I'm having trouble to do that.
I tried this:
df['cnt'] = df.groupby(['Year', 'City']).cumcount()+1
But I get something like this:
4 1927 New York 1
5 1927 New York 2
While obviously, I would expect to get only 1 row with value 2.
Any ideas?
Additionally, do I need column "Occurrence", which I added manually. Can I just count without it, like in SQL?
As suggested by @mozway I tried:
a['Cnt'] = a.groupby(['Year', 'City'])['occurrence'].sum()
But I get the error:
ValueError Traceback (most recent call last)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/frame.py:10775, in _reindex_for_setitem(value, index)
10774 try:
> 10775 reindexed_value = value.reindex(index)._values
10776 except ValueError as err:
10777 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/series.py:4580, in Series.reindex(self, index, **kwargs)
4572 @doc(
4573 NDFrame.reindex, # type: ignore[has-type]
4574 klass=_shared_doc_kwargs["klass"],
(...)
4578 )
4579 def reindex(self, index=None, **kwargs):
-> 4580 return super().reindex(index=index, **kwargs)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/generic.py:4818, in NDFrame.reindex(self, *args, **kwargs)
4817 # perform the reindex on the axes
-> 4818 return self._reindex_axes(
4819 axes, level, limit, tolerance, method, fill_value, copy
4820 ).__finalize__(self, method="reindex")
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/generic.py:4834, in NDFrame._reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4833 ax = self._get_axis(a)
-> 4834 new_index, indexer = ax.reindex(
4835 labels, level=level, limit=limit, tolerance=tolerance, method=method
4836 )
4838 axis = self._get_axis_number(a)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/indexes/multi.py:2533, in MultiIndex.reindex(self, target, method, level, limit, tolerance)
2532 try:
-> 2533 target = MultiIndex.from_tuples(target)
2534 except TypeError:
2535 # not all tuples, see test_constructor_dict_multiindex_reindex_flat
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/indexes/multi.py:202, in names_compat.<locals>.new_meth(self_or_cls, *args, **kwargs)
200 kwargs["names"] = kwargs.pop("name")
--> 202 return meth(self_or_cls, *args, **kwargs)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/indexes/multi.py:553, in MultiIndex.from_tuples(cls, tuples, sortorder, names)
551 tuples = np.asarray(tuples._values)
--> 553 arrays = list(lib.tuples_to_object_array(tuples).T)
554 elif isinstance(tuples, list):
File ~/Library/Python/3.8/lib/python/site-packages/pandas/_libs/lib.pyx:2919, in pandas._libs.lib.tuples_to_object_array()
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
Input In [139], in <module>
----> 1 a['Cnt'] = a.groupby(['Year', 'City'])['occurrence'].sum()
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/frame.py:3612, in DataFrame.__setitem__(self, key, value)
3609 self._setitem_array([key], value)
3610 else:
3611 # set column
-> 3612 self._set_item(key, value)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/frame.py:3784, in DataFrame._set_item(self, key, value)
3774 def _set_item(self, key, value) -> None:
3775 """
3776 Add series to DataFrame in specified column.
3777
(...)
3782 ensure homogeneity.
3783 """
-> 3784 value = self._sanitize_column(value)
3786 if (
3787 key in self.columns
3788 and value.ndim == 1
3789 and not is_extension_array_dtype(value)
3790 ):
3791 # broadcast across multiple columns if necessary
3792 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/frame.py:4506, in DataFrame._sanitize_column(self, value)
4504 # We should never get here with DataFrame value
4505 if isinstance(value, Series):
-> 4506 return _reindex_for_setitem(value, self.index)
4508 if is_list_like(value):
4509 com.require_length_match(value, self.index)
File ~/Library/Python/3.8/lib/python/site-packages/pandas/core/frame.py:10782, in _reindex_for_setitem(value, index)
10778 if not value.index.is_unique:
10779 # duplicate axis
10780 raise err
> 10782 raise TypeError(
10783 "incompatible index of inserted column with frame index"
10784 ) from err
10785 return reindexed_value
TypeError: incompatible index of inserted column with frame index