Кейс про анализ продаж 2

import pandas as pd
url = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/02-data-wrangling/data/raw/sample_sales_details.xlsx?raw=True"
df = pd.read_excel(url)
type(df)
pandas.core.frame.DataFrame
df.head(10)
invoice company purchase_date sku product quantity price extended amount shipping_cost
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68
5 MI-696-11 Zooveo 2019-10-17 B-BK-5 pen -1 30 -30 0.00
6 MQ-907-02 Babbleset 2019-10-27 L-579 poster 30 21 630 12.60
7 NX-102-26 Fliptune 2019-10-16 E-201 book 40 28 1120 22.40
8 LE-516-00 Buzzbean 2019-06-17 S-393 poster -3 16 -48 0.00
9 VD-518-20 Dabshots 2019-03-12 XL-B-12 shirt 19 28 532 10.64
df.info() # to_datetime()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   sku              1000 non-null   object        
 4   product          1000 non-null   object        
 5   quantity         1000 non-null   int64         
 6   price            1000 non-null   int64         
 7   extended amount  1000 non-null   int64         
 8   shipping_cost    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df.index
RangeIndex(start=0, stop=1000, step=1)
df.columns
Index(['invoice', 'company', 'purchase_date', 'sku', 'product', 'quantity',
       'price', 'extended amount', 'shipping_cost'],
      dtype='object')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   sku              1000 non-null   object        
 4   product          1000 non-null   object        
 5   quantity         1000 non-null   int64         
 6   price            1000 non-null   int64         
 7   extended amount  1000 non-null   int64         
 8   shipping_cost    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df['purchase_date']
0     2019-03-05
1     2019-07-09
2     2019-08-12
3     2019-11-18
4     2019-04-18
         ...    
995   2019-09-11
996   2019-09-05
997   2019-03-24
998   2019-12-30
999   2019-11-12
Name: purchase_date, Length: 1000, dtype: datetime64[ns]
df['purchase_date'].index
RangeIndex(start=0, stop=1000, step=1)
df['purchase_date'].values[0]
numpy.datetime64('2019-03-05T00:00:00.000000000')
type(df['purchase_date'])
pandas.core.series.Series
df.purchase_date
0     2019-03-05
1     2019-07-09
2     2019-08-12
3     2019-11-18
4     2019-04-18
         ...    
995   2019-09-11
996   2019-09-05
997   2019-03-24
998   2019-12-30
999   2019-11-12
Name: purchase_date, Length: 1000, dtype: datetime64[ns]
df.purchase_date.dt.year
0      2019
1      2019
2      2019
3      2019
4      2019
       ... 
995    2019
996    2019
997    2019
998    2019
999    2019
Name: purchase_date, Length: 1000, dtype: int32
df.purchase_date.dt.year
0      2019
1      2019
2      2019
3      2019
4      2019
       ... 
995    2019
996    2019
997    2019
998    2019
999    2019
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt
<pandas.core.indexes.accessors.DatetimeProperties object at 0x0000027BB0859DC0>
#@title
df['purchase_date'].dt.month # аксессор
0       3
1       7
2       8
3      11
4       4
       ..
995     9
996     9
997     3
998    12
999    11
Name: purchase_date, Length: 1000, dtype: int32
#@title
df['purchase_date'].dt.year
0      2019
1      2019
2      2019
3      2019
4      2019
       ... 
995    2019
996    2019
997    2019
998    2019
999    2019
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.dayofweek # https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.weekday.html
0      1
1      1
2      0
3      0
4      3
      ..
995    2
996    3
997    6
998    0
999    1
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.quarter # квартал
0      1
1      3
2      3
3      4
4      2
      ..
995    3
996    3
997    1
998    4
999    4
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.days_in_month
0      31
1      31
2      31
3      30
4      30
       ..
995    30
996    30
997    31
998    31
999    30
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.is_month_end
0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999    False
Name: purchase_date, Length: 1000, dtype: bool
df.head() # изменений не произошло
invoice company purchase_date sku product quantity price extended amount shipping_cost
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68
# добавить в качестве столбцов таблицы
df['purchase_month'] = df['purchase_date'].dt.month
df['purchase_year'] = df['purchase_date'].dt.year
df['purchase_qrtr'] = df['purchase_date'].dt.quarter
df.head()
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06 11 2019 4
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 4 2019 2
df['purchase_year'] > 2018
0      True
1      True
2      True
3      True
4      True
       ... 
995    True
996    True
997    True
998    True
999    True
Name: purchase_year, Length: 1000, dtype: bool
df[df['purchase_year'] > 2018]
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06 11 2019 4
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 4 2019 2
... ... ... ... ... ... ... ... ... ... ... ... ...
995 ZM-628-88 Viva 2019-09-11 B-B-5 pen -5 34 -170 0.00 9 2019 3
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88 9 2019 3
997 RA-147-40 Dabfeed 2019-03-24 M-809 poster 17 34 578 11.56 3 2019 1
998 VT-754-54 Photobean 2019-12-30 L-G-31 shirt 15 18 270 5.40 12 2019 4
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32 11 2019 4

1000 rows × 12 columns

df[df['purchase_year'] > 2018].head(3)
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3
df.query("purchase_year > 2018").head(3)
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3

Working with strings

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   sku              1000 non-null   object        
 4   product          1000 non-null   object        
 5   quantity         1000 non-null   int64         
 6   price            1000 non-null   int64         
 7   extended amount  1000 non-null   int64         
 8   shipping_cost    1000 non-null   float64       
 9   purchase_month   1000 non-null   int32         
 10  purchase_year    1000 non-null   int32         
 11  purchase_qrtr    1000 non-null   int32         
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(4)
memory usage: 82.2+ KB
df['company']
0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         Viva
996         Viva
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object
df.company
0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         Viva
996         Viva
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object
"hello".replace('e', 'll')
'hllllo'
df.company
0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         Viva
996         Viva
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object
df.company.str.
df.company.str.lower()
0       realcube
1          zooxo
2        dabtype
3       skipfire
4       bluezoom
         ...    
995         viva
996         viva
997      dabfeed
998    photobean
999       mybuzz
Name: company, Length: 1000, dtype: object
df.company.str.replace('Vi', 'va')
0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         vava
996         vava
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.upper()
0       REALCUBE
1          ZOOXO
2        DABTYPE
3       SKIPFIRE
4       BLUEZOOM
         ...    
995         VIVA
996         VIVA
997      DABFEED
998    PHOTOBEAN
999       MYBUZZ
Name: company, Length: 1000, dtype: object
'hello'.upper()
'HELLO'
df['company'].str.lower()
0       realcube
1          zooxo
2        dabtype
3       skipfire
4       bluezoom
         ...    
995         viva
996         viva
997      dabfeed
998    photobean
999       mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.title()
0       Realcube
1          Zooxo
2        Dabtype
3       Skipfire
4       Bluezoom
         ...    
995         Viva
996         Viva
997      Dabfeed
998    Photobean
999       Mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.len()
0      8
1      5
2      7
3      8
4      8
      ..
995    4
996    4
997    7
998    9
999    6
Name: company, Length: 1000, dtype: int64
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                       'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte
0    Graham Chapman
1       John Cleese
2     Terry Gilliam
3         Eric Idle
4       Terry Jones
5     Michael Palin
dtype: object
monte.str.split()
0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object
monte.str.split().str.get(1)
0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object
import seaborn as sns
sns.set_theme(style="ticks")

df['company'].str.len().plot(kind='hist');

df.company.str.replace('a', 'b')
0       Reblcube
1          Zooxo
2        Dbbtype
3       Skipfire
4       Bluezoom
         ...    
995         Vivb
996         Vivb
997      Dbbfeed
998    Photobebn
999       Mybuzz
Name: company, Length: 1000, dtype: object
df['upper_company'] = df['company'].str.upper()
df.head()
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr upper_company
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1 REALCUBE
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3 ZOOXO
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3 DABTYPE
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06 11 2019 4 SKIPFIRE
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 4 2019 2 BLUEZOOM
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   sku              1000 non-null   object        
 4   product          1000 non-null   object        
 5   quantity         1000 non-null   int64         
 6   price            1000 non-null   int64         
 7   extended amount  1000 non-null   int64         
 8   shipping_cost    1000 non-null   float64       
 9   purchase_month   1000 non-null   int32         
 10  purchase_year    1000 non-null   int32         
 11  purchase_qrtr    1000 non-null   int32         
 12  upper_company    1000 non-null   object        
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(5)
memory usage: 90.0+ KB
df['extended amount'] * .9
0      290.7
1      378.0
2      144.9
3      182.7
4      615.6
       ...  
995   -153.0
996    489.6
997    520.2
998    243.0
999    734.4
Name: extended amount, Length: 1000, dtype: float64
df['extended amount'].mul(.9)
0      290.7
1      378.0
2      144.9
3      182.7
4      615.6
       ...  
995   -153.0
996    489.6
997    520.2
998    243.0
999    734.4
Name: extended amount, Length: 1000, dtype: float64
df['extended amount']
0      323
1      420
2      161
3      203
4      684
      ... 
995   -170
996    544
997    578
998    270
999    816
Name: extended amount, Length: 1000, dtype: int64
df['new_price'] = 1.05 * df['price']
df['new_extended_amount'] = df['new_price'] * df['quantity']
df.head()
invoice company purchase_date sku product quantity price extended amount shipping_cost purchase_month purchase_year purchase_qrtr upper_company new_price new_extended_amount
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 3 2019 1 REALCUBE 17.85 339.15
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 7 2019 3 ZOOXO 14.70 441.00
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 8 2019 3 DABTYPE 24.15 169.05
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06 11 2019 4 SKIPFIRE 30.45 213.15
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 4 2019 2 BLUEZOOM 19.95 718.20
df[['extended amount', 'new_extended_amount']].min()
extended amount       -170.0
new_extended_amount   -178.5
dtype: float64
df[['extended amount', 'new_extended_amount']].describe().T
count mean std min 25% 50% 75% max
extended amount 1000.0 510.2700 426.411667 -170.0 168.7500 435.00 798.500 1715.00
new_extended_amount 1000.0 535.7835 447.732251 -178.5 177.1875 456.75 838.425 1800.75