Агрегация данных

# Load the sample sales dataset directly from GitHub (?raw=True makes GitHub
# serve the raw .xlsx file instead of the HTML page).
import pandas as pd
url = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/01-intro-to-pandas/data/raw/sample_sales.xlsx?raw=True"
df = pd.read_excel(url)
df.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
# Number of purchase rows per company, sorted descending.
df["company"].value_counts()
company
Voomm           8
Dynabox         8
Thoughtstorm    8
Wikizz          8
Realcube        7
               ..
Centimia        1
Oloo            1
Youopia         1
Realmix         1
Omba            1
Name: count, Length: 351, dtype: int64
# The company with the largest number of purchases.
(
    df["company"]
    .value_counts()
    .idxmax()
)
'Voomm'
# Summary statistics (count, mean, std, min/quartiles/max) for price.
df['price'].describe()
count    1000.000000
mean       22.816000
std         7.537039
min        10.000000
25%        16.000000
50%        23.000000
75%        29.000000
max        35.000000
Name: price, dtype: float64
# Average price across all rows.
df['price'].mean()
22.816
# Maximum price.
df['price'].max()
35
# agg with a single function name is equivalent to calling the method directly.
df['price'].agg('mean') # df['price'].mean()
22.816
# Several statistics at once: agg with a list of names returns a Series.
df['price'].agg(['mean', 'std', 'min', 'max'])
mean    22.816000
std      7.537039
min     10.000000
max     35.000000
Name: price, dtype: float64
# Frame-wide mean; numeric_only=True is forwarded to mean() so non-numeric
# columns (invoice, company, product, purchase_date) are skipped.
df.agg('mean', numeric_only=True) # !
quantity            22.421
price               22.816
extended amount    510.270
dtype: float64
# Multiple functions over the whole frame; .T transposes so that the
# statistics become columns. Note the order of the list controls row order.
df.agg(['max', 'mean']).T # !
df.agg(['mean', 'max']).T # !
# Per-column aggregation plan: each key maps a column to its function(s).
agg_cols = {
    'quantity': 'sum',
    'price': ['mean', 'std'],
    'invoice': 'count',
    'extended amount': 'sum',
}

df.agg(agg_cols)
quantity price invoice extended amount
sum 22421.0 NaN NaN 510270.0
mean NaN 22.816000 NaN NaN
std NaN 7.537039 NaN NaN
count NaN NaN 1000.0 NaN
import numpy as np
# NaN is the missing-value marker pandas uses (playing a role similar to None).
np.nan # None
nan
# NaN is a float at the Python level.
type(np.nan)
float
# Replace the NaN cells (functions that were not requested for a column) with 0.
df.agg(agg_cols).fillna(0)
quantity price invoice extended amount
sum 22421.0 0.000000 0.0 510270.0
mean 0.0 22.816000 0.0 0.0
std 0.0 7.537039 0.0 0.0
count 0.0 0.000000 1000.0 0.0
# Same chain, wrapped in parentheses instead of backslash line continuations.
(
    df
      .agg(agg_cols)
      .fillna(0)
)
quantity price invoice extended amount
sum 22421.0 0.000000 0.0 510270.0
mean 0.0 22.816000 0.0 0.0
std 0.0 7.537039 0.0 0.0
count 0.0 0.000000 1000.0 0.0
# Display the full frame (head and tail rows).
df
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
... ... ... ... ... ... ... ...
995 ZM-628-88 Viva 2019-09-11 pen -5 34 -170
996 DQ-810-46 Viva 2019-09-05 pen 17 32 544
997 RA-147-40 Dabfeed 2019-03-24 poster 17 34 578
998 VT-754-54 Photobean 2019-12-30 shirt 15 18 270
999 LS-463-74 Mybuzz 2019-11-12 shirt 24 34 816

1000 rows × 7 columns

df["product"]
0       shirt
1        book
2      poster
3         pen
4      poster
        ...  
995       pen
996       pen
997    poster
998     shirt
999     shirt
Name: product, Length: 1000, dtype: object
# groupby on its own is lazy: it returns a GroupBy object, not a result.
df.groupby(['product'])
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001EA1E2F3AD0>
# Sum each group's columns. numeric_only=True is required here:
# purchase_date is datetime64, and summing it raises
# "TypeError: datetime64 type does not support sum operations"
# (as the traceback below demonstrates).
(df
 .groupby(['product'])
 .sum(numeric_only=True)
)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[27], line 3
      1 (df
      2  .groupby(['product'])
----> 3  .sum()
      4 )

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:3146, in GroupBy.sum(self, numeric_only, min_count, engine, engine_kwargs)
   3141 else:
   3142     # If we are grouping on categoricals we want unobserved categories to
   3143     # return zero, rather than the default of NaN which the reindexing in
   3144     # _agg_general() returns. GH #31422
   3145     with com.temp_setattr(self, "observed", True):
-> 3146         result = self._agg_general(
   3147             numeric_only=numeric_only,
   3148             min_count=min_count,
   3149             alias="sum",
   3150             npfunc=np.sum,
   3151         )
   3153     return self._reindex_output(result, fill_value=0)

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1906, in GroupBy._agg_general(self, numeric_only, min_count, alias, npfunc, **kwargs)
   1896 @final
   1897 def _agg_general(
   1898     self,
   (...)
   1904     **kwargs,
   1905 ):
-> 1906     result = self._cython_agg_general(
   1907         how=alias,
   1908         alt=npfunc,
   1909         numeric_only=numeric_only,
   1910         min_count=min_count,
   1911         **kwargs,
   1912     )
   1913     return result.__finalize__(self.obj, method="groupby")

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1998, in GroupBy._cython_agg_general(self, how, alt, numeric_only, min_count, **kwargs)
   1995     result = self._agg_py_fallback(how, values, ndim=data.ndim, alt=alt)
   1996     return result
-> 1998 new_mgr = data.grouped_reduce(array_func)
   1999 res = self._wrap_agged_manager(new_mgr)
   2000 if how in ["idxmin", "idxmax"]:

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\internals\managers.py:1472, in BlockManager.grouped_reduce(self, func)
   1470             result_blocks = extend_blocks(applied, result_blocks)
   1471     else:
-> 1472         applied = blk.apply(func)
   1473         result_blocks = extend_blocks(applied, result_blocks)
   1475 if len(result_blocks) == 0:

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\internals\blocks.py:393, in Block.apply(self, func, **kwargs)
    387 @final
    388 def apply(self, func, **kwargs) -> list[Block]:
    389     """
    390     apply the function to my values; return a block if we are not
    391     one
    392     """
--> 393     result = func(self.values, **kwargs)
    395     result = maybe_coerce_values(result)
    396     return self._split_op_result(result)

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\groupby.py:1973, in GroupBy._cython_agg_general.<locals>.array_func(values)
   1971 def array_func(values: ArrayLike) -> ArrayLike:
   1972     try:
-> 1973         result = self._grouper._cython_operation(
   1974             "aggregate",
   1975             values,
   1976             how,
   1977             axis=data.ndim - 1,
   1978             min_count=min_count,
   1979             **kwargs,
   1980         )
   1981     except NotImplementedError:
   1982         # generally if we have numeric_only=False
   1983         # and non-applicable functions
   1984         # try to python agg
   1985         # TODO: shouldn't min_count matter?
   1986         # TODO: avoid special casing SparseArray here
   1987         if how in ["any", "all"] and isinstance(values, SparseArray):

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:831, in BaseGrouper._cython_operation(self, kind, values, how, axis, min_count, **kwargs)
    829 ids, _, _ = self.group_info
    830 ngroups = self.ngroups
--> 831 return cy_op.cython_operation(
    832     values=values,
    833     axis=axis,
    834     min_count=min_count,
    835     comp_ids=ids,
    836     ngroups=ngroups,
    837     **kwargs,
    838 )

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\groupby\ops.py:541, in WrappedCythonOp.cython_operation(self, values, axis, min_count, comp_ids, ngroups, **kwargs)
    537 self._validate_axis(axis, values)
    539 if not isinstance(values, np.ndarray):
    540     # i.e. ExtensionArray
--> 541     return values._groupby_op(
    542         how=self.how,
    543         has_dropped_na=self.has_dropped_na,
    544         min_count=min_count,
    545         ngroups=ngroups,
    546         ids=comp_ids,
    547         **kwargs,
    548     )
    550 return self._cython_op_ndim_compat(
    551     values,
    552     min_count=min_count,
   (...)
    556     **kwargs,
    557 )

File ~\AppData\Local\anaconda3\Lib\site-packages\pandas\core\arrays\datetimelike.py:1671, in DatetimeLikeArrayMixin._groupby_op(self, how, has_dropped_na, min_count, ngroups, ids, **kwargs)
   1668 if dtype.kind == "M":
   1669     # Adding/multiplying datetimes is not valid
   1670     if how in ["sum", "prod", "cumsum", "cumprod", "var", "skew"]:
-> 1671         raise TypeError(f"datetime64 type does not support {how} operations")
   1672     if how in ["any", "all"]:
   1673         # GH#34479
   1674         warnings.warn(
   1675             f"'{how}' with datetime64 dtypes is deprecated and will raise in a "
   1676             f"future version. Use (obj != pd.Timestamp(0)).{how}() instead.",
   1677             FutureWarning,
   1678             stacklevel=find_stack_level(),
   1679         )

TypeError: datetime64 type does not support sum operations
# Selecting one column after grouping -> a lazy SeriesGroupBy object.
df \
 .groupby('product')['quantity']
# Row count per product, ascending.
# (Selecting a column only to count rows is a roundabout way to get
# value_counts — see the next cell.)
(
    df
    .groupby(['product'])['quantity']
    .count()
    .sort_values()
)
# The idiomatic equivalent of the previous cell (sorted descending).
df['product'].value_counts()
prod_cols = {'quantity': 'sum'}

# Total quantity per product; reset_index turns the group key back into a column.
(
    df
    .groupby('product')
    .agg(prod_cols)
    .reset_index()
)
prod_cols = {'quantity': 'sum'}

# Same aggregation, written with parentheses instead of backslashes.
(df
 .groupby(['product'])
 .agg(prod_cols)
 .reset_index()
)
# Several statistics for quantity per product (MultiIndex result columns).
prod_cols = {'quantity': ['sum', 'mean', 'std', 'max']}

df.groupby(['product']).agg(prod_cols)
prod_cols = {'quantity':
             ['sum', 'mean', 'std', 'max', 'count']}

# Group by two keys -> hierarchical (MultiIndex) row index.
(
df
 .groupby(['company', 'product'])
 .agg(prod_cols)
)
# Keep the grouped result; fillna(0) replaces any NaN in the aggregates
# (presumably std of single-row groups — verify against the data).
df_gr = (df
          .groupby(['company', 'product'])
          .agg(prod_cols)
          .fillna(0)
          )

# Exploration hints for the MultiIndex result (uncomment to try):
#df_gr
#df_gr.xs("Abatz", drop_level=False)
#df_gr.xs("Abatz")

#df_gr["quantity"]["sum"].values[:4]
#df_gr["quantity"]["sum"].idxmax()
# Two-key grouping with the keys flattened back into ordinary columns.
df \
 .groupby(['company', 'product']) \
 .agg(prod_cols) \
 .reset_index()
# Aggregate, then rename the result columns to friendlier names.
(df
 .groupby(['company'])
 .agg({'invoice': 'count',
      'extended amount': 'max'})
 .rename(columns={"invoice": "invoice_total",
                  'extended amount': "max_purchase"})
)
# Named aggregation: new_name=(column, function) — agg + rename in one step.
(df
 .groupby(['company'])
 .agg(invoice_total=('invoice', 'count'),
      max_purchase=('extended amount', 'max'))
)
# Same aggregation without renaming: columns keep their original names.
(df
 .groupby(['company'])
 .agg({'invoice': 'count',
       'extended amount': 'max'})
)

Pivot table and crosstab

# Recap of the raw data before pivoting.
df.head()
# Companies as rows, products as columns, summed 'extended amount' in the cells.
pd.pivot_table(
    df,
    index=['company'],
    columns=['product'],
    values=['extended amount'],
    aggfunc='sum',
)
# Same pivot with grand totals and no missing cells.
pd.pivot_table(df,
               index=['company'],
               columns=['product'],
               values=['extended amount'],
               aggfunc='sum',
               margins=True, # adds the "All" totals row/column
               fill_value=0) # fill empty combinations with zeros
# Several aggregation functions at once -> one column group per function.
pd.pivot_table(df,
               index=['company'],
               columns=['product'],
               values=['extended amount'],
               aggfunc=['sum', 'mean', 'max'],
               margins=True,
               fill_value=0)
# Both keys in the row index (no columns argument) -> long, groupby-like shape.
pd.pivot_table(df,
               index=['company', 'product'],
               values=['extended amount'],
               aggfunc=['sum'],
               margins=True,
               fill_value=0)
pd.crosstab(df['company'],
            df['product']) # counts occurrences by default
pd.crosstab(df['company'],
            df['product'],
            values=df['extended amount'], # what to aggregate
            aggfunc='sum', # how to aggregate
            normalize='index') # each row sums to 1: share within the company's total
pd.crosstab(df['company'],
            df['product'],
            values=df['extended amount'], # what to aggregate
            aggfunc='sum', # how to aggregate
            normalize='columns') # each column sums to 1: share within the product's total