Кейс про анализ

import pandas as pd
url = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/02-data-wrangling/data/raw/sample_sales_details.xlsx?raw=True"
df = pd.read_excel(url)
df
invoice company purchase_date sku product quantity price extended amount shipping_cost
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68
... ... ... ... ... ... ... ... ... ...
995 ZM-628-88 Viva 2019-09-11 B-B-5 pen -5 34 -170 0.00
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
997 RA-147-40 Dabfeed 2019-03-24 M-809 poster 17 34 578 11.56
998 VT-754-54 Photobean 2019-12-30 L-G-31 shirt 15 18 270 5.40
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32

1000 rows × 9 columns

df['company'] == 'Viva'
0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996     True
997    False
998    False
999    False
Name: company, Length: 1000, dtype: bool
viva = (df['company'] == 'Viva')
df.loc[viva, ["company","quantity"]] # доп возможности фильтрации
company quantity
186 Viva 8
705 Viva 6
840 Viva 46
995 Viva -5
996 Viva 17
df[viva] # ~
invoice company purchase_date sku product quantity price extended amount shipping_cost
186 CM-205-86 Viva 2019-05-19 E-327 book 8 14 112 2.24
705 QU-986-45 Viva 2019-06-09 H-966 book 6 35 210 4.20
840 RF-796-61 Viva 2019-02-19 S-393 poster 46 16 736 14.72
995 ZM-628-88 Viva 2019-09-11 B-B-5 pen -5 34 -170 0.00
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
qty_10 = (df['quantity'] >= 10)
qty_10
0       True
1       True
2      False
3      False
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: quantity, Length: 1000, dtype: bool
df[(qty_10 & viva)]
invoice company purchase_date sku product quantity price extended amount shipping_cost
840 RF-796-61 Viva 2019-02-19 S-393 poster 46 16 736 14.72
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
df.loc[(qty_10 & viva), :]
invoice company purchase_date sku product quantity price extended amount shipping_cost
840 RF-796-61 Viva 2019-02-19 S-393 poster 46 16 736 14.72
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
df.loc[(qty_10 & viva) , 'purchase_date':'price']
purchase_date sku product quantity price
840 2019-02-19 S-393 poster 46 16
996 2019-09-05 M-B-5 pen 17 32
df['company'].str.contains('buzz')
0      False
1      False
2      False
3      False
4      False
       ...  
995    False
996    False
997    False
998    False
999     True
Name: company, Length: 1000, dtype: bool
'p' in "python"
True
buzz = df['company'].str.contains('buzz')
df[buzz]
invoice company purchase_date sku product quantity price extended amount shipping_cost
23 NE-854-88 Realbuzz 2019-05-08 M-271 poster 11 14 154 3.08
48 NM-907-40 Mybuzz 2019-11-29 S-500 poster 11 26 286 5.72
153 KW-456-55 Realbuzz 2019-05-21 S-393 poster 38 22 836 16.72
206 GV-236-09 Linkbuzz 2019-06-25 M-809 poster 49 18 882 17.64
326 EY-555-15 Realbuzz 2019-04-18 XL-B-12 shirt 0 12 0 0.00
470 SW-903-99 Realbuzz 2019-07-30 L-477 poster 6 29 174 3.48
589 HS-879-34 Mybuzz 2019-02-15 L-743 poster 36 30 1080 21.60
687 LU-384-08 Linkbuzz 2019-01-15 S-B-41 shirt 35 35 1225 24.50
947 YW-362-87 Linkbuzz 2019-12-11 P-529 book 45 17 765 15.30
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32
df.loc[buzz, ["sku", "product"]]
sku product
23 M-271 poster
48 S-500 poster
153 S-393 poster
206 M-809 poster
326 XL-B-12 shirt
470 L-477 poster
589 L-743 poster
687 S-B-41 shirt
947 P-529 book
999 S-G-73 shirt
df.loc[df['sku'].str.startswith('S'), 'product']
2      poster
8      poster
14     poster
34     poster
40     poster
        ...  
975     shirt
982     shirt
992    poster
993    poster
999     shirt
Name: product, Length: 229, dtype: object
df \
    .loc[df['sku'].str.startswith('S'), 'product']\
    .value_counts(normalize=True)
product
shirt     0.532751
poster    0.467249
Name: proportion, dtype: float64

Working with dates

df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   sku              1000 non-null   object        
 4   product          1000 non-null   object        
 5   quantity         1000 non-null   int64         
 6   price            1000 non-null   int64         
 7   extended amount  1000 non-null   int64         
 8   shipping_cost    1000 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df[df['purchase_date'] >= '2019-09-02']
invoice company purchase_date sku product quantity price extended amount shipping_cost
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06
5 MI-696-11 Zooveo 2019-10-17 B-BK-5 pen -1 30 -30 0.00
6 MQ-907-02 Babbleset 2019-10-27 L-579 poster 30 21 630 12.60
7 NX-102-26 Fliptune 2019-10-16 E-201 book 40 28 1120 22.40
16 IG-811-84 Yadel 2019-11-15 H-475 book 46 28 1288 25.76
... ... ... ... ... ... ... ... ... ...
994 ON-247-90 Photobug 2019-09-29 L-G-13 shirt 40 31 1240 24.80
995 ZM-628-88 Viva 2019-09-11 B-B-5 pen -5 34 -170 0.00
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
998 VT-754-54 Photobean 2019-12-30 L-G-31 shirt 15 18 270 5.40
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32

331 rows × 9 columns

purchase_date = (df['purchase_date'].dt.month == 11)
product_type = (df['product'] == 'book')
df[purchase_date & product_type]
invoice company purchase_date sku product quantity price extended amount shipping_cost
16 IG-811-84 Yadel 2019-11-15 H-475 book 46 28 1288 25.76
51 MM-603-72 Snaptags 2019-11-14 P-944 book 13 25 325 6.50
67 DB-217-08 Dynabox 2019-11-17 E-327 book 17 26 442 8.84
77 TL-625-84 Snaptags 2019-11-02 H-966 book 28 33 924 18.48
148 UE-108-04 Yodo 2019-11-24 P-343 book 29 27 783 15.66
194 CP-917-92 Kamba 2019-11-13 E-777 book 41 16 656 13.12
248 LZ-428-00 Photojam 2019-11-22 H-966 book 0 25 0 0.00
303 PE-226-53 Gabvine 2019-11-22 P-393 book 19 18 342 6.84
338 EP-683-72 Npath 2019-11-22 E-961 book 1 15 15 0.30
528 QP-747-38 Ainyx 2019-11-19 P-293 book 9 31 279 5.58
554 IK-877-78 Eayo 2019-11-04 H-475 book 19 28 532 10.64
555 OJ-368-07 Devcast 2019-11-11 E-992 book 36 11 396 7.92
576 RS-914-89 Skyndu 2019-11-13 H-475 book -1 17 -17 0.00
590 OQ-663-81 Muxo 2019-11-10 P-460 book 1 11 11 0.22
660 ZC-675-39 Skinix 2019-11-26 E-368 book 22 29 638 12.76
750 UK-253-68 Leenti 2019-11-02 P-942 book 18 22 396 7.92
790 IO-056-62 Yambee 2019-11-09 P-944 book 42 18 756 15.12
795 VY-434-82 Oyondu 2019-11-24 P-343 book 8 27 216 4.32
805 PS-970-82 Gigabox 2019-11-16 E-201 book 19 28 532 10.64
846 HB-316-64 Vinder 2019-11-25 P-944 book 0 25 0 0.00
908 CB-260-83 Livepath 2019-11-11 P-942 book 20 27 540 10.80
931 CJ-981-24 Skyba 2019-11-01 P-293 book 28 23 644 12.88
932 CO-837-17 Dabshots 2019-11-19 H-475 book 50 24 1200 24.00
df[df['quantity'] > 12]
invoice company purchase_date sku product quantity price extended amount shipping_cost
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68
6 MQ-907-02 Babbleset 2019-10-27 L-579 poster 30 21 630 12.60
7 NX-102-26 Fliptune 2019-10-16 E-201 book 40 28 1120 22.40
... ... ... ... ... ... ... ... ... ...
994 ON-247-90 Photobug 2019-09-29 L-G-13 shirt 40 31 1240 24.80
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88
997 RA-147-40 Dabfeed 2019-03-24 M-809 poster 17 34 578 11.56
998 VT-754-54 Photobean 2019-12-30 L-G-31 shirt 15 18 270 5.40
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32

681 rows × 9 columns

df['min_order_size'] = 5
df.head()
invoice company purchase_date sku product quantity price extended amount shipping_cost min_order_size
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 5
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 5
2 FI-165-58 Dabtype 2019-08-12 S-909 poster 7 23 161 3.22 5
3 XP-005-55 Skipfire 2019-11-18 B-B-5 pen 7 29 203 4.06 5
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 5
small_orders = df['quantity'] < df['min_order_size']
small_orders
0      False
1      False
2      False
3      False
4      False
       ...  
995     True
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool
df.loc[small_orders, 'company':'product']
company purchase_date sku product
5 Zooveo 2019-10-17 B-BK-5 pen
8 Buzzbean 2019-06-17 S-393 poster
19 Babbleblab 2019-10-23 M-G-21 shirt
24 Mynte 2019-11-25 M-BK-5 pen
27 DabZ 2019-01-23 M-B-5 pen
... ... ... ... ...
980 Roomm 2019-09-23 B-B-1 pen
981 Dabfeed 2019-03-25 M-271 poster
984 Reallinks 2019-06-24 M-BK-5 pen
986 Jamia 2019-11-06 L-G-13 shirt
995 Viva 2019-09-11 B-B-5 pen

176 rows × 4 columns

df.query('quantity > 12') # df[df['quantity'] > 12]
invoice company purchase_date sku product quantity price extended amount shipping_cost min_order_size
0 ZN-870-29 Realcube 2019-03-05 L-Y-71 shirt 19 17 323 6.46 5
1 JQ-501-63 Zooxo 2019-07-09 E-532 book 30 14 420 8.40 5
4 NB-917-18 Bluezoom 2019-04-18 L-477 poster 36 19 684 13.68 5
6 MQ-907-02 Babbleset 2019-10-27 L-579 poster 30 21 630 12.60 5
7 NX-102-26 Fliptune 2019-10-16 E-201 book 40 28 1120 22.40 5
... ... ... ... ... ... ... ... ... ... ...
994 ON-247-90 Photobug 2019-09-29 L-G-13 shirt 40 31 1240 24.80 5
996 DQ-810-46 Viva 2019-09-05 M-B-5 pen 17 32 544 10.88 5
997 RA-147-40 Dabfeed 2019-03-24 M-809 poster 17 34 578 11.56 5
998 VT-754-54 Photobean 2019-12-30 L-G-31 shirt 15 18 270 5.40 5
999 LS-463-74 Mybuzz 2019-11-12 S-G-73 shirt 24 34 816 16.32 5

681 rows × 10 columns