import pandas as pdМерджинг данных
input_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/sample_sales.xlsx?raw=True"
level_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/customer_levels.xlsx?raw=True"
extra_transactions = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/sample_sales_50_extra.xlsx?raw=True"sales = pd.read_excel(input_file)
levels = pd.read_excel(level_file)
extra_sales = pd.read_excel(extra_transactions)sales.head()| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 |
sales.shape(1000, 7)
levels.head()| Company Name | level | |
|---|---|---|
| 0 | Abatz | diamond |
| 1 | Agivu | silver |
| 2 | Aibox | platinum |
| 3 | Ailane | silver |
| 4 | Aimbo | diamond |
levels.shape(351, 2)
extra_sales.shape(50, 7)
extra_sales.head()| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-293 | Linkbuzz | 2019-01-28 | shirt | 28 | 17 | 476 |
| 1 | JQ-501-633 | Aimbu | 2019-10-04 | book | 21 | 14 | 294 |
| 2 | FI-165-583 | Dablist | 2019-05-05 | poster | 4 | 23 | 92 |
| 3 | XP-005-553 | Rhycero | 2019-11-03 | pen | 49 | 29 | 1421 |
| 4 | NB-917-183 | Skinder | 2019-07-17 | poster | 18 | 19 | 342 |
pd.concat([sales, extra_sales])| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 45 | PS-680-563 | Quimm | 2019-09-19 | poster | 0 | 35 | 0 |
| 46 | MU-441-943 | Podcat | 2019-09-15 | shirt | 7 | 15 | 105 |
| 47 | RR-924-583 | Oyoloo | 2019-12-24 | book | 17 | 10 | 170 |
| 48 | NM-907-403 | Thoughtbeat | 2019-08-02 | poster | 8 | 26 | 208 |
| 49 | CC-124-201 | Thoughtblab | 2019-06-30 | poster | 23 | 5 | 115 |
1050 rows × 7 columns
sales.append(extra_sales) # аналогичный результатall_sales = pd.concat([sales,
extra_sales])all_sales.shape(1050, 7)
all_sales.head()| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 |
levels.head()| Company Name | level | |
|---|---|---|
| 0 | Abatz | diamond |
| 1 | Agivu | silver |
| 2 | Aibox | platinum |
| 3 | Ailane | silver |
| 4 | Aimbo | diamond |
# This will give an error
pd.merge(all_sales, levels)pd.merge(all_sales,
levels,
left_on='company',
right_on='Company Name')| invoice | company | purchase_date | product | quantity | price | extended amount | Company Name | level | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 | Realcube | gold |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 | Zooxo | platinum |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 | Dabtype | gold |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 | Skipfire | gold |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 | Bluezoom | gold |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1045 | PS-680-563 | Quimm | 2019-09-19 | poster | 0 | 35 | 0 | Quimm | gold |
| 1046 | MU-441-943 | Podcat | 2019-09-15 | shirt | 7 | 15 | 105 | Podcat | silver |
| 1047 | RR-924-583 | Oyoloo | 2019-12-24 | book | 17 | 10 | 170 | Oyoloo | silver |
| 1048 | NM-907-403 | Thoughtbeat | 2019-08-02 | poster | 8 | 26 | 208 | Thoughtbeat | silver |
| 1049 | CC-124-201 | Thoughtblab | 2019-06-30 | poster | 23 | 5 | 115 | Thoughtblab | gold |
1050 rows × 9 columns
levels.head()| Company Name | level | |
|---|---|---|
| 0 | Abatz | diamond |
| 1 | Agivu | silver |
| 2 | Aibox | platinum |
| 3 | Ailane | silver |
| 4 | Aimbo | diamond |
levels = levels.rename(
columns={'Company Name': 'company'}
)levels.head()| company | level | |
|---|---|---|
| 0 | Abatz | diamond |
| 1 | Agivu | silver |
| 2 | Aibox | platinum |
| 3 | Ailane | silver |
| 4 | Aimbo | diamond |
all_sales.head()| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 |
pd.merge(all_sales, levels)| invoice | company | purchase_date | product | quantity | price | extended amount | level | |
|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 | gold |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 | platinum |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 | gold |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 | gold |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 | gold |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1045 | PS-680-563 | Quimm | 2019-09-19 | poster | 0 | 35 | 0 | gold |
| 1046 | MU-441-943 | Podcat | 2019-09-15 | shirt | 7 | 15 | 105 | silver |
| 1047 | RR-924-583 | Oyoloo | 2019-12-24 | book | 17 | 10 | 170 | silver |
| 1048 | NM-907-403 | Thoughtbeat | 2019-08-02 | poster | 8 | 26 | 208 | silver |
| 1049 | CC-124-201 | Thoughtblab | 2019-06-30 | poster | 23 | 5 | 115 | gold |
1050 rows × 8 columns
final_data = pd.merge(all_sales, levels)final_data.head()| invoice | company | purchase_date | product | quantity | price | extended amount | level | |
|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 | gold |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 | platinum |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 | gold |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 | gold |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 | gold |
final_data.shape(1050, 8)
final_data.tail()| invoice | company | purchase_date | product | quantity | price | extended amount | level | |
|---|---|---|---|---|---|---|---|---|
| 1045 | PS-680-563 | Quimm | 2019-09-19 | poster | 0 | 35 | 0 | gold |
| 1046 | MU-441-943 | Podcat | 2019-09-15 | shirt | 7 | 15 | 105 | silver |
| 1047 | RR-924-583 | Oyoloo | 2019-12-24 | book | 17 | 10 | 170 | silver |
| 1048 | NM-907-403 | Thoughtbeat | 2019-08-02 | poster | 8 | 26 | 208 | silver |
| 1049 | CC-124-201 | Thoughtblab | 2019-06-30 | poster | 23 | 5 | 115 | gold |