import pandas as pdКейс про анализ
url = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/02-data-wrangling/data/raw/sample_sales_details.xlsx?raw=True"
df = pd.read_excel(url)df| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | ZM-628-88 | Viva | 2019-09-11 | B-B-5 | pen | -5 | 34 | -170 | 0.00 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
| 997 | RA-147-40 | Dabfeed | 2019-03-24 | M-809 | poster | 17 | 34 | 578 | 11.56 |
| 998 | VT-754-54 | Photobean | 2019-12-30 | L-G-31 | shirt | 15 | 18 | 270 | 5.40 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 |
1000 rows × 9 columns
df['company'] == 'Viva'0 False
1 False
2 False
3 False
4 False
...
995 True
996 True
997 False
998 False
999 False
Name: company, Length: 1000, dtype: bool
viva = (df['company'] == 'Viva')df.loc[viva, ["company","quantity"]] # доп возможности фильтрации| company | quantity | |
|---|---|---|
| 186 | Viva | 8 |
| 705 | Viva | 6 |
| 840 | Viva | 46 |
| 995 | Viva | -5 |
| 996 | Viva | 17 |
df[viva] # ~| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 186 | CM-205-86 | Viva | 2019-05-19 | E-327 | book | 8 | 14 | 112 | 2.24 |
| 705 | QU-986-45 | Viva | 2019-06-09 | H-966 | book | 6 | 35 | 210 | 4.20 |
| 840 | RF-796-61 | Viva | 2019-02-19 | S-393 | poster | 46 | 16 | 736 | 14.72 |
| 995 | ZM-628-88 | Viva | 2019-09-11 | B-B-5 | pen | -5 | 34 | -170 | 0.00 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
qty_10 = (df['quantity'] >= 10)qty_100 True
1 True
2 False
3 False
4 True
...
995 False
996 True
997 True
998 True
999 True
Name: quantity, Length: 1000, dtype: bool
df[(qty_10 & viva)]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 840 | RF-796-61 | Viva | 2019-02-19 | S-393 | poster | 46 | 16 | 736 | 14.72 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
df.loc[(qty_10 & viva), :]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 840 | RF-796-61 | Viva | 2019-02-19 | S-393 | poster | 46 | 16 | 736 | 14.72 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
df.loc[(qty_10 & viva) , 'purchase_date':'price']| purchase_date | sku | product | quantity | price | |
|---|---|---|---|---|---|
| 840 | 2019-02-19 | S-393 | poster | 46 | 16 |
| 996 | 2019-09-05 | M-B-5 | pen | 17 | 32 |
df['company'].str.contains('buzz')0 False
1 False
2 False
3 False
4 False
...
995 False
996 False
997 False
998 False
999 True
Name: company, Length: 1000, dtype: bool
'p' in "python"True
buzz = df['company'].str.contains('buzz')df[buzz]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 23 | NE-854-88 | Realbuzz | 2019-05-08 | M-271 | poster | 11 | 14 | 154 | 3.08 |
| 48 | NM-907-40 | Mybuzz | 2019-11-29 | S-500 | poster | 11 | 26 | 286 | 5.72 |
| 153 | KW-456-55 | Realbuzz | 2019-05-21 | S-393 | poster | 38 | 22 | 836 | 16.72 |
| 206 | GV-236-09 | Linkbuzz | 2019-06-25 | M-809 | poster | 49 | 18 | 882 | 17.64 |
| 326 | EY-555-15 | Realbuzz | 2019-04-18 | XL-B-12 | shirt | 0 | 12 | 0 | 0.00 |
| 470 | SW-903-99 | Realbuzz | 2019-07-30 | L-477 | poster | 6 | 29 | 174 | 3.48 |
| 589 | HS-879-34 | Mybuzz | 2019-02-15 | L-743 | poster | 36 | 30 | 1080 | 21.60 |
| 687 | LU-384-08 | Linkbuzz | 2019-01-15 | S-B-41 | shirt | 35 | 35 | 1225 | 24.50 |
| 947 | YW-362-87 | Linkbuzz | 2019-12-11 | P-529 | book | 45 | 17 | 765 | 15.30 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 |
df.loc[buzz, ["sku", "product"]]| sku | product | |
|---|---|---|
| 23 | M-271 | poster |
| 48 | S-500 | poster |
| 153 | S-393 | poster |
| 206 | M-809 | poster |
| 326 | XL-B-12 | shirt |
| 470 | L-477 | poster |
| 589 | L-743 | poster |
| 687 | S-B-41 | shirt |
| 947 | P-529 | book |
| 999 | S-G-73 | shirt |
df.loc[df['sku'].str.startswith('S'), 'product']2 poster
8 poster
14 poster
34 poster
40 poster
...
975 shirt
982 shirt
992 poster
993 poster
999 shirt
Name: product, Length: 229, dtype: object
df \
.loc[df['sku'].str.startswith('S'), 'product']\
.value_counts(normalize=True)product
shirt 0.532751
poster 0.467249
Name: proportion, dtype: float64
Working with dates
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 sku 1000 non-null object
4 product 1000 non-null object
5 quantity 1000 non-null int64
6 price 1000 non-null int64
7 extended amount 1000 non-null int64
8 shipping_cost 1000 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df[df['purchase_date'] >= '2019-09-02']| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 |
| 5 | MI-696-11 | Zooveo | 2019-10-17 | B-BK-5 | pen | -1 | 30 | -30 | 0.00 |
| 6 | MQ-907-02 | Babbleset | 2019-10-27 | L-579 | poster | 30 | 21 | 630 | 12.60 |
| 7 | NX-102-26 | Fliptune | 2019-10-16 | E-201 | book | 40 | 28 | 1120 | 22.40 |
| 16 | IG-811-84 | Yadel | 2019-11-15 | H-475 | book | 46 | 28 | 1288 | 25.76 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | ON-247-90 | Photobug | 2019-09-29 | L-G-13 | shirt | 40 | 31 | 1240 | 24.80 |
| 995 | ZM-628-88 | Viva | 2019-09-11 | B-B-5 | pen | -5 | 34 | -170 | 0.00 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
| 998 | VT-754-54 | Photobean | 2019-12-30 | L-G-31 | shirt | 15 | 18 | 270 | 5.40 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 |
331 rows × 9 columns
purchase_date = (df['purchase_date'].dt.month == 11)
product_type = (df['product'] == 'book')df[purchase_date & product_type]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 16 | IG-811-84 | Yadel | 2019-11-15 | H-475 | book | 46 | 28 | 1288 | 25.76 |
| 51 | MM-603-72 | Snaptags | 2019-11-14 | P-944 | book | 13 | 25 | 325 | 6.50 |
| 67 | DB-217-08 | Dynabox | 2019-11-17 | E-327 | book | 17 | 26 | 442 | 8.84 |
| 77 | TL-625-84 | Snaptags | 2019-11-02 | H-966 | book | 28 | 33 | 924 | 18.48 |
| 148 | UE-108-04 | Yodo | 2019-11-24 | P-343 | book | 29 | 27 | 783 | 15.66 |
| 194 | CP-917-92 | Kamba | 2019-11-13 | E-777 | book | 41 | 16 | 656 | 13.12 |
| 248 | LZ-428-00 | Photojam | 2019-11-22 | H-966 | book | 0 | 25 | 0 | 0.00 |
| 303 | PE-226-53 | Gabvine | 2019-11-22 | P-393 | book | 19 | 18 | 342 | 6.84 |
| 338 | EP-683-72 | Npath | 2019-11-22 | E-961 | book | 1 | 15 | 15 | 0.30 |
| 528 | QP-747-38 | Ainyx | 2019-11-19 | P-293 | book | 9 | 31 | 279 | 5.58 |
| 554 | IK-877-78 | Eayo | 2019-11-04 | H-475 | book | 19 | 28 | 532 | 10.64 |
| 555 | OJ-368-07 | Devcast | 2019-11-11 | E-992 | book | 36 | 11 | 396 | 7.92 |
| 576 | RS-914-89 | Skyndu | 2019-11-13 | H-475 | book | -1 | 17 | -17 | 0.00 |
| 590 | OQ-663-81 | Muxo | 2019-11-10 | P-460 | book | 1 | 11 | 11 | 0.22 |
| 660 | ZC-675-39 | Skinix | 2019-11-26 | E-368 | book | 22 | 29 | 638 | 12.76 |
| 750 | UK-253-68 | Leenti | 2019-11-02 | P-942 | book | 18 | 22 | 396 | 7.92 |
| 790 | IO-056-62 | Yambee | 2019-11-09 | P-944 | book | 42 | 18 | 756 | 15.12 |
| 795 | VY-434-82 | Oyondu | 2019-11-24 | P-343 | book | 8 | 27 | 216 | 4.32 |
| 805 | PS-970-82 | Gigabox | 2019-11-16 | E-201 | book | 19 | 28 | 532 | 10.64 |
| 846 | HB-316-64 | Vinder | 2019-11-25 | P-944 | book | 0 | 25 | 0 | 0.00 |
| 908 | CB-260-83 | Livepath | 2019-11-11 | P-942 | book | 20 | 27 | 540 | 10.80 |
| 931 | CJ-981-24 | Skyba | 2019-11-01 | P-293 | book | 28 | 23 | 644 | 12.88 |
| 932 | CO-837-17 | Dabshots | 2019-11-19 | H-475 | book | 50 | 24 | 1200 | 24.00 |
df[df['quantity'] > 12]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 |
| 6 | MQ-907-02 | Babbleset | 2019-10-27 | L-579 | poster | 30 | 21 | 630 | 12.60 |
| 7 | NX-102-26 | Fliptune | 2019-10-16 | E-201 | book | 40 | 28 | 1120 | 22.40 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | ON-247-90 | Photobug | 2019-09-29 | L-G-13 | shirt | 40 | 31 | 1240 | 24.80 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 |
| 997 | RA-147-40 | Dabfeed | 2019-03-24 | M-809 | poster | 17 | 34 | 578 | 11.56 |
| 998 | VT-754-54 | Photobean | 2019-12-30 | L-G-31 | shirt | 15 | 18 | 270 | 5.40 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 |
681 rows × 9 columns
df['min_order_size'] = 5df.head()| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | min_order_size | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 5 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 5 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 5 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 | 5 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 5 |
small_orders = df['quantity'] < df['min_order_size']
small_orders0 False
1 False
2 False
3 False
4 False
...
995 True
996 False
997 False
998 False
999 False
Length: 1000, dtype: bool
df.loc[small_orders, 'company':'product']| company | purchase_date | sku | product | |
|---|---|---|---|---|
| 5 | Zooveo | 2019-10-17 | B-BK-5 | pen |
| 8 | Buzzbean | 2019-06-17 | S-393 | poster |
| 19 | Babbleblab | 2019-10-23 | M-G-21 | shirt |
| 24 | Mynte | 2019-11-25 | M-BK-5 | pen |
| 27 | DabZ | 2019-01-23 | M-B-5 | pen |
| ... | ... | ... | ... | ... |
| 980 | Roomm | 2019-09-23 | B-B-1 | pen |
| 981 | Dabfeed | 2019-03-25 | M-271 | poster |
| 984 | Reallinks | 2019-06-24 | M-BK-5 | pen |
| 986 | Jamia | 2019-11-06 | L-G-13 | shirt |
| 995 | Viva | 2019-09-11 | B-B-5 | pen |
176 rows × 4 columns
df.query('quantity > 12') # df[df['quantity'] > 12]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | min_order_size | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 5 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 5 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 5 |
| 6 | MQ-907-02 | Babbleset | 2019-10-27 | L-579 | poster | 30 | 21 | 630 | 12.60 | 5 |
| 7 | NX-102-26 | Fliptune | 2019-10-16 | E-201 | book | 40 | 28 | 1120 | 22.40 | 5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | ON-247-90 | Photobug | 2019-09-29 | L-G-13 | shirt | 40 | 31 | 1240 | 24.80 | 5 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 | 5 |
| 997 | RA-147-40 | Dabfeed | 2019-03-24 | M-809 | poster | 17 | 34 | 578 | 11.56 | 5 |
| 998 | VT-754-54 | Photobean | 2019-12-30 | L-G-31 | shirt | 15 | 18 | 270 | 5.40 | 5 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 | 5 |
681 rows × 10 columns