Мерджинг данных

import pandas as pd
input_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/sample_sales.xlsx?raw=True"
level_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/customer_levels.xlsx?raw=True"
extra_transactions = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/03-aggregating-grouping-merging/data/raw/sample_sales_50_extra.xlsx?raw=True"
sales = pd.read_excel(input_file)
levels = pd.read_excel(level_file)
extra_sales = pd.read_excel(extra_transactions)
sales.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
sales.shape
(1000, 7)
levels.head()
Company Name level
0 Abatz diamond
1 Agivu silver
2 Aibox platinum
3 Ailane silver
4 Aimbo diamond
levels.shape
(351, 2)
extra_sales.shape
(50, 7)
extra_sales.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-293 Linkbuzz 2019-01-28 shirt 28 17 476
1 JQ-501-633 Aimbu 2019-10-04 book 21 14 294
2 FI-165-583 Dablist 2019-05-05 poster 4 23 92
3 XP-005-553 Rhycero 2019-11-03 pen 49 29 1421
4 NB-917-183 Skinder 2019-07-17 poster 18 19 342
pd.concat([sales, extra_sales])
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
... ... ... ... ... ... ... ...
45 PS-680-563 Quimm 2019-09-19 poster 0 35 0
46 MU-441-943 Podcat 2019-09-15 shirt 7 15 105
47 RR-924-583 Oyoloo 2019-12-24 book 17 10 170
48 NM-907-403 Thoughtbeat 2019-08-02 poster 8 26 208
49 CC-124-201 Thoughtblab 2019-06-30 poster 23 5 115

1050 rows × 7 columns

sales.append(extra_sales) # аналогичный результат
all_sales = pd.concat([sales,
                       extra_sales])
all_sales.shape
(1050, 7)
all_sales.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
levels.head()
Company Name level
0 Abatz diamond
1 Agivu silver
2 Aibox platinum
3 Ailane silver
4 Aimbo diamond
# This will give an error
pd.merge(all_sales, levels)
pd.merge(all_sales,
         levels,
         left_on='company',
         right_on='Company Name')
invoice company purchase_date product quantity price extended amount Company Name level
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323 Realcube gold
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420 Zooxo platinum
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161 Dabtype gold
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203 Skipfire gold
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684 Bluezoom gold
... ... ... ... ... ... ... ... ... ...
1045 PS-680-563 Quimm 2019-09-19 poster 0 35 0 Quimm gold
1046 MU-441-943 Podcat 2019-09-15 shirt 7 15 105 Podcat silver
1047 RR-924-583 Oyoloo 2019-12-24 book 17 10 170 Oyoloo silver
1048 NM-907-403 Thoughtbeat 2019-08-02 poster 8 26 208 Thoughtbeat silver
1049 CC-124-201 Thoughtblab 2019-06-30 poster 23 5 115 Thoughtblab gold

1050 rows × 9 columns

levels.head()
Company Name level
0 Abatz diamond
1 Agivu silver
2 Aibox platinum
3 Ailane silver
4 Aimbo diamond
levels = levels.rename(
                        columns={'Company Name': 'company'}
                       )
levels.head()
company level
0 Abatz diamond
1 Agivu silver
2 Aibox platinum
3 Ailane silver
4 Aimbo diamond
all_sales.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
pd.merge(all_sales, levels)
invoice company purchase_date product quantity price extended amount level
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323 gold
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420 platinum
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161 gold
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203 gold
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684 gold
... ... ... ... ... ... ... ... ...
1045 PS-680-563 Quimm 2019-09-19 poster 0 35 0 gold
1046 MU-441-943 Podcat 2019-09-15 shirt 7 15 105 silver
1047 RR-924-583 Oyoloo 2019-12-24 book 17 10 170 silver
1048 NM-907-403 Thoughtbeat 2019-08-02 poster 8 26 208 silver
1049 CC-124-201 Thoughtblab 2019-06-30 poster 23 5 115 gold

1050 rows × 8 columns

final_data = pd.merge(all_sales, levels)
final_data.head()
invoice company purchase_date product quantity price extended amount level
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323 gold
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420 platinum
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161 gold
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203 gold
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684 gold
final_data.shape
(1050, 8)
final_data.tail()
invoice company purchase_date product quantity price extended amount level
1045 PS-680-563 Quimm 2019-09-19 poster 0 35 0 gold
1046 MU-441-943 Podcat 2019-09-15 shirt 7 15 105 silver
1047 RR-924-583 Oyoloo 2019-12-24 book 17 10 170 silver
1048 NM-907-403 Thoughtbeat 2019-08-02 poster 8 26 208 silver
1049 CC-124-201 Thoughtblab 2019-06-30 poster 23 5 115 gold