# Data aggregation
import pandas as pd

# Sales sample used throughout this lesson (?raw=True makes GitHub serve the file itself).
DATA_URL = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/01-intro-to-pandas/data/raw/sample_sales.xlsx?raw=True"

df = pd.read_excel(DATA_URL)

# Quick look at the first rows:
# columns are invoice, company, purchase_date, product, quantity, price, extended amount
df.head()
# How many orders does each company have?
# (1000 rows spread over 351 companies; the leaders have 8 orders each.)
df["company"].value_counts()
# The company with the most orders ('Voomm').
df["company"].value_counts().idxmax()
# Summary statistics (count/mean/std/min/quartiles/max) for the price column.
df["price"].describe()
# Individual aggregations of a single column.
df["price"].mean()       # 22.816
df["price"].max()        # 35

# .agg with a function name is equivalent to calling the method directly.
df["price"].agg('mean')  # same as df['price'].mean()
# Several aggregations of one column at once — .agg accepts a list of names
# and returns a Series indexed by the aggregation name.
df["price"].agg(['mean', 'std', 'min', 'max'])
# Mean of every numeric column of the frame
# (numeric_only=True skips the non-numeric columns such as the dates).
df.agg('mean', numeric_only=True)
# Transpose so the aggregations become columns; order of the list only
# changes the row order: df.agg(['mean', 'max']).T gives the same data.
df.agg(['max', 'mean']).T

# A dict maps each column to its own aggregation(s); cells that do not
# apply to a column come back as NaN.
agg_cols = {'quantity': 'sum',
            'price': ['mean', 'std'],
            'invoice': 'count',
            'extended amount': 'sum'}

df.agg(agg_cols)
import numpy as np

# NaN is not None — it is a float-typed "not a number" sentinel.
np.nan
type(np.nan)  # float
# Replace the NaN gaps in the aggregation table with zeros.
df.agg(agg_cols).fillna(0)

# The same pipeline written with surrounding parentheses instead of
# backslash line continuations — the preferred chaining style.
(
    df
    .agg(agg_cols)
    .fillna(0)
)
# The full frame: 1000 rows x 7 columns
# (note row 995 has a negative quantity of -5 — a return).
df

# One column selected by name is a Series of dtype object.
df["product"]
# groupby by itself computes nothing — it returns a lazy DataFrameGroupBy
# object; an aggregation must be called on it.
df.groupby(['product'])

# FIX: a bare .sum() raises
#   TypeError: datetime64 type does not support sum operations
# in pandas >= 2.0, because the purchase_date column is datetime64 and
# datetimes cannot be added. Restrict the aggregation to numeric columns.
(df
 .groupby(['product'])
 .sum(numeric_only=True)
)
# Selecting a column from a groupby gives a lazy SeriesGroupBy object.
df.groupby('product')['quantity']

# Count rows per product — a roundabout way (marked "bad" in the lesson):
(df
 .groupby(['product'])['quantity']
 .count()
 .sort_values()
)

# ...because value_counts on the column is the direct equivalent.
df['product'].value_counts()

# Sum the quantity per product via an aggregation dict.
prod_cols = {'quantity': 'sum'}

df.groupby('product').agg(prod_cols).reset_index()
# Same result, written as a parenthesised pipeline.
prod_cols = {'quantity': 'sum'}
(df
 .groupby(['product'])
 .agg(prod_cols)
 .reset_index()
)

# Several statistics for quantity per product.
prod_cols = {'quantity':
             ['sum', 'mean', 'std', 'max']}
(df
 .groupby(['product'])
 .agg(prod_cols)
)

# Two grouping keys produce a (company, product) MultiIndex.
prod_cols = {'quantity':
             ['sum', 'mean', 'std', 'max', 'count']}
(
    df
    .groupby(['company', 'product'])
    .agg(prod_cols)
)

# Keep the grouped table around for exploration; std is NaN for
# single-row groups, so fill those gaps with 0.
df_gr = (df
         .groupby(['company', 'product'])
         .agg(prod_cols)
         .fillna(0)
         )
# Exploration helpers for the MultiIndex result (uncomment to try):
#df_gr
#df_gr.xs("Abatz", drop_level=False)
#df_gr.xs("Abatz")
#df_gr["quantity"]["sum"].values[:4]
#df_gr["quantity"]["sum"].idxmax()

# Flatten the group keys back into ordinary columns.
(df
 .groupby(['company', 'product'])
 .agg(prod_cols)
 .reset_index()
)

# Per-company aggregation, then rename the result columns afterwards.
(df
 .groupby(['company'])
 .agg({'invoice': 'count',
       'extended amount': 'max'})
 .rename(columns={"invoice": "invoice_total",
                  'extended amount': "max_purchase"})
)

# Named aggregation does the aggregate-and-rename in a single step.
(df
 .groupby(['company'])
 .agg(invoice_total=('invoice', 'count'),
      max_purchase=('extended amount', 'max'))
)

# For comparison: the dict form again, without the rename.
(df
 .groupby(['company'])
 .agg({'invoice': 'count',
       'extended amount': 'max'})
)
# Pivot table and crosstab
df.head()

# Companies as rows, products as columns, summed purchase amounts as cells.
pd.pivot_table(df,
               index=['company'],
               columns=['product'],
               values=['extended amount'],
               aggfunc='sum')

pd.pivot_table(df,
               index=['company'],
               columns=['product'],
               values=['extended amount'],
               aggfunc='sum',
               margins=True,   # adds the 'All' totals row and column
               fill_value=0)   # missing cells become 0

# Several aggregation functions side by side.
pd.pivot_table(df,
               index=['company'],
               columns=['product'],
               values=['extended amount'],
               aggfunc=['sum', 'mean', 'max'],
               margins=True,
               fill_value=0)

# Both keys in the index instead of spreading products across columns.
pd.pivot_table(df,
               index=['company', 'product'],
               values=['extended amount'],
               aggfunc=['sum'],
               margins=True,
               fill_value=0)

# crosstab counts element occurrences by default.
pd.crosstab(df['company'],
            df['product'])

pd.crosstab(df['company'],
            df['product'],
            values=df['extended amount'],  # what to aggregate
            aggfunc='sum',                 # how to aggregate
            normalize='index')             # each row as a share of its total

pd.crosstab(df['company'],
            df['product'],
            values=df['extended amount'],  # what to aggregate
            aggfunc='sum',                 # how to aggregate
            normalize='columns')           # each column as a share of its total