import pandas as pdКейс про анализ продаж 2
url = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/02-data-wrangling/data/raw/sample_sales_details.xlsx?raw=True"
df = pd.read_excel(url)type(df)pandas.core.frame.DataFrame
df.head(10)| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 |
| 5 | MI-696-11 | Zooveo | 2019-10-17 | B-BK-5 | pen | -1 | 30 | -30 | 0.00 |
| 6 | MQ-907-02 | Babbleset | 2019-10-27 | L-579 | poster | 30 | 21 | 630 | 12.60 |
| 7 | NX-102-26 | Fliptune | 2019-10-16 | E-201 | book | 40 | 28 | 1120 | 22.40 |
| 8 | LE-516-00 | Buzzbean | 2019-06-17 | S-393 | poster | -3 | 16 | -48 | 0.00 |
| 9 | VD-518-20 | Dabshots | 2019-03-12 | XL-B-12 | shirt | 19 | 28 | 532 | 10.64 |
df.info() # to_datetime()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 sku 1000 non-null object
4 product 1000 non-null object
5 quantity 1000 non-null int64
6 price 1000 non-null int64
7 extended amount 1000 non-null int64
8 shipping_cost 1000 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df.indexRangeIndex(start=0, stop=1000, step=1)
df.columnsIndex(['invoice', 'company', 'purchase_date', 'sku', 'product', 'quantity',
'price', 'extended amount', 'shipping_cost'],
dtype='object')
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 sku 1000 non-null object
4 product 1000 non-null object
5 quantity 1000 non-null int64
6 price 1000 non-null int64
7 extended amount 1000 non-null int64
8 shipping_cost 1000 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 70.4+ KB
df['purchase_date']0 2019-03-05
1 2019-07-09
2 2019-08-12
3 2019-11-18
4 2019-04-18
...
995 2019-09-11
996 2019-09-05
997 2019-03-24
998 2019-12-30
999 2019-11-12
Name: purchase_date, Length: 1000, dtype: datetime64[ns]
df['purchase_date'].indexRangeIndex(start=0, stop=1000, step=1)
df['purchase_date'].values[0]numpy.datetime64('2019-03-05T00:00:00.000000000')
type(df['purchase_date'])pandas.core.series.Series
df.purchase_date0 2019-03-05
1 2019-07-09
2 2019-08-12
3 2019-11-18
4 2019-04-18
...
995 2019-09-11
996 2019-09-05
997 2019-03-24
998 2019-12-30
999 2019-11-12
Name: purchase_date, Length: 1000, dtype: datetime64[ns]
df.purchase_date.dt.year0 2019
1 2019
2 2019
3 2019
4 2019
...
995 2019
996 2019
997 2019
998 2019
999 2019
Name: purchase_date, Length: 1000, dtype: int32
df.purchase_date.dt.year0 2019
1 2019
2 2019
3 2019
4 2019
...
995 2019
996 2019
997 2019
998 2019
999 2019
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt<pandas.core.indexes.accessors.DatetimeProperties object at 0x0000027BB0859DC0>
#@title
df['purchase_date'].dt.month # аксессор0 3
1 7
2 8
3 11
4 4
..
995 9
996 9
997 3
998 12
999 11
Name: purchase_date, Length: 1000, dtype: int32
#@title
df['purchase_date'].dt.year0 2019
1 2019
2 2019
3 2019
4 2019
...
995 2019
996 2019
997 2019
998 2019
999 2019
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.dayofweek # https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.weekday.html0 1
1 1
2 0
3 0
4 3
..
995 2
996 3
997 6
998 0
999 1
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.quarter # квартал0 1
1 3
2 3
3 4
4 2
..
995 3
996 3
997 1
998 4
999 4
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.days_in_month0 31
1 31
2 31
3 30
4 30
..
995 30
996 30
997 31
998 31
999 30
Name: purchase_date, Length: 1000, dtype: int32
df['purchase_date'].dt.is_month_end0 False
1 False
2 False
3 False
4 False
...
995 False
996 False
997 False
998 False
999 False
Name: purchase_date, Length: 1000, dtype: bool
df.head() # изменений не произошло| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 |
# добавить в качестве столбцов таблицы
df['purchase_month'] = df['purchase_date'].dt.month
df['purchase_year'] = df['purchase_date'].dt.year
df['purchase_qrtr'] = df['purchase_date'].dt.quarterdf.head()| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 | 11 | 2019 | 4 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 4 | 2019 | 2 |
df['purchase_year'] > 20180 True
1 True
2 True
3 True
4 True
...
995 True
996 True
997 True
998 True
999 True
Name: purchase_year, Length: 1000, dtype: bool
df[df['purchase_year'] > 2018]| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 | 11 | 2019 | 4 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 4 | 2019 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 995 | ZM-628-88 | Viva | 2019-09-11 | B-B-5 | pen | -5 | 34 | -170 | 0.00 | 9 | 2019 | 3 |
| 996 | DQ-810-46 | Viva | 2019-09-05 | M-B-5 | pen | 17 | 32 | 544 | 10.88 | 9 | 2019 | 3 |
| 997 | RA-147-40 | Dabfeed | 2019-03-24 | M-809 | poster | 17 | 34 | 578 | 11.56 | 3 | 2019 | 1 |
| 998 | VT-754-54 | Photobean | 2019-12-30 | L-G-31 | shirt | 15 | 18 | 270 | 5.40 | 12 | 2019 | 4 |
| 999 | LS-463-74 | Mybuzz | 2019-11-12 | S-G-73 | shirt | 24 | 34 | 816 | 16.32 | 11 | 2019 | 4 |
1000 rows × 12 columns
df[df['purchase_year'] > 2018].head(3)| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 |
df.query("purchase_year > 2018").head(3)| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 |
Working with strings
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 sku 1000 non-null object
4 product 1000 non-null object
5 quantity 1000 non-null int64
6 price 1000 non-null int64
7 extended amount 1000 non-null int64
8 shipping_cost 1000 non-null float64
9 purchase_month 1000 non-null int32
10 purchase_year 1000 non-null int32
11 purchase_qrtr 1000 non-null int32
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(4)
memory usage: 82.2+ KB
df['company']0 Realcube
1 Zooxo
2 Dabtype
3 Skipfire
4 Bluezoom
...
995 Viva
996 Viva
997 Dabfeed
998 Photobean
999 Mybuzz
Name: company, Length: 1000, dtype: object
df.company0 Realcube
1 Zooxo
2 Dabtype
3 Skipfire
4 Bluezoom
...
995 Viva
996 Viva
997 Dabfeed
998 Photobean
999 Mybuzz
Name: company, Length: 1000, dtype: object
"hello".replace('e', 'll')'hllllo'
df.company0 Realcube
1 Zooxo
2 Dabtype
3 Skipfire
4 Bluezoom
...
995 Viva
996 Viva
997 Dabfeed
998 Photobean
999 Mybuzz
Name: company, Length: 1000, dtype: object
df.company.str.df.company.str.lower()0 realcube
1 zooxo
2 dabtype
3 skipfire
4 bluezoom
...
995 viva
996 viva
997 dabfeed
998 photobean
999 mybuzz
Name: company, Length: 1000, dtype: object
df.company.str.replace('Vi', 'va')0 Realcube
1 Zooxo
2 Dabtype
3 Skipfire
4 Bluezoom
...
995 vava
996 vava
997 Dabfeed
998 Photobean
999 Mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.upper()0 REALCUBE
1 ZOOXO
2 DABTYPE
3 SKIPFIRE
4 BLUEZOOM
...
995 VIVA
996 VIVA
997 DABFEED
998 PHOTOBEAN
999 MYBUZZ
Name: company, Length: 1000, dtype: object
'hello'.upper()'HELLO'
df['company'].str.lower()0 realcube
1 zooxo
2 dabtype
3 skipfire
4 bluezoom
...
995 viva
996 viva
997 dabfeed
998 photobean
999 mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.title()0 Realcube
1 Zooxo
2 Dabtype
3 Skipfire
4 Bluezoom
...
995 Viva
996 Viva
997 Dabfeed
998 Photobean
999 Mybuzz
Name: company, Length: 1000, dtype: object
df['company'].str.len()0 8
1 5
2 7
3 8
4 8
..
995 4
996 4
997 7
998 9
999 6
Name: company, Length: 1000, dtype: int64
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
'Eric Idle', 'Terry Jones', 'Michael Palin'])monte0 Graham Chapman
1 John Cleese
2 Terry Gilliam
3 Eric Idle
4 Terry Jones
5 Michael Palin
dtype: object
monte.str.split()0 [Graham, Chapman]
1 [John, Cleese]
2 [Terry, Gilliam]
3 [Eric, Idle]
4 [Terry, Jones]
5 [Michael, Palin]
dtype: object
monte.str.split().str.get(1)0 Chapman
1 Cleese
2 Gilliam
3 Idle
4 Jones
5 Palin
dtype: object
import seaborn as sns
sns.set_theme(style="ticks")
df['company'].str.len().plot(kind='hist');
df.company.str.replace('a', 'b')0 Reblcube
1 Zooxo
2 Dbbtype
3 Skipfire
4 Bluezoom
...
995 Vivb
996 Vivb
997 Dbbfeed
998 Photobebn
999 Mybuzz
Name: company, Length: 1000, dtype: object
df['upper_company'] = df['company'].str.upper()df.head()| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | upper_company | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 | REALCUBE |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 | ZOOXO |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 | DABTYPE |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 | 11 | 2019 | 4 | SKIPFIRE |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 4 | 2019 | 2 | BLUEZOOM |
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 sku 1000 non-null object
4 product 1000 non-null object
5 quantity 1000 non-null int64
6 price 1000 non-null int64
7 extended amount 1000 non-null int64
8 shipping_cost 1000 non-null float64
9 purchase_month 1000 non-null int32
10 purchase_year 1000 non-null int32
11 purchase_qrtr 1000 non-null int32
12 upper_company 1000 non-null object
dtypes: datetime64[ns](1), float64(1), int32(3), int64(3), object(5)
memory usage: 90.0+ KB
df['extended amount'] * .90 290.7
1 378.0
2 144.9
3 182.7
4 615.6
...
995 -153.0
996 489.6
997 520.2
998 243.0
999 734.4
Name: extended amount, Length: 1000, dtype: float64
df['extended amount'].mul(.9)0 290.7
1 378.0
2 144.9
3 182.7
4 615.6
...
995 -153.0
996 489.6
997 520.2
998 243.0
999 734.4
Name: extended amount, Length: 1000, dtype: float64
df['extended amount']0 323
1 420
2 161
3 203
4 684
...
995 -170
996 544
997 578
998 270
999 816
Name: extended amount, Length: 1000, dtype: int64
df['new_price'] = 1.05 * df['price']
df['new_extended_amount'] = df['new_price'] * df['quantity']df.head()| invoice | company | purchase_date | sku | product | quantity | price | extended amount | shipping_cost | purchase_month | purchase_year | purchase_qrtr | upper_company | new_price | new_extended_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | L-Y-71 | shirt | 19 | 17 | 323 | 6.46 | 3 | 2019 | 1 | REALCUBE | 17.85 | 339.15 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | E-532 | book | 30 | 14 | 420 | 8.40 | 7 | 2019 | 3 | ZOOXO | 14.70 | 441.00 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | S-909 | poster | 7 | 23 | 161 | 3.22 | 8 | 2019 | 3 | DABTYPE | 24.15 | 169.05 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | B-B-5 | pen | 7 | 29 | 203 | 4.06 | 11 | 2019 | 4 | SKIPFIRE | 30.45 | 213.15 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | L-477 | poster | 36 | 19 | 684 | 13.68 | 4 | 2019 | 2 | BLUEZOOM | 19.95 | 718.20 |
df[['extended amount', 'new_extended_amount']].min()extended amount -170.0
new_extended_amount -178.5
dtype: float64
df[['extended amount', 'new_extended_amount']].describe().T| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| extended amount | 1000.0 | 510.2700 | 426.411667 | -170.0 | 168.7500 | 435.00 | 798.500 | 1715.00 |
| new_extended_amount | 1000.0 | 535.7835 | 447.732251 | -178.5 | 177.1875 | 456.75 | 838.425 | 1800.75 |