import pandas as pdЧтение файлов
src_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/04-data-input-output/data/raw/excel_complicated_document.xlsx?raw=True"df = pd.read_excel(src_file)#!pip uninstall openpyxl#!pip install openpyxldf.head()df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
df = pd.read_excel(src_file,
sheet_name='sales data')df.head()| invoice | company | purchase_date | product | quantity | price | extended amount | Average Price | 22.816 | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 | Standard Deviation | 7.533269 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 | NaN | NaN |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 | Total Qty Sold | NaN |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 | shirt | 6249.000000 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 | book | 5340.000000 |
df = pd.read_excel(src_file,
sheet_name='sales data',
usecols='A:G')df.head()| invoice | company | purchase_date | product | quantity | price | extended amount | |
|---|---|---|---|---|---|---|---|
| 0 | ZN-870-29 | Realcube | 2019-03-05 | shirt | 19 | 17 | 323 |
| 1 | JQ-501-63 | Zooxo | 2019-07-09 | book | 30 | 14 | 420 |
| 2 | FI-165-58 | Dabtype | 2019-08-12 | poster | 7 | 23 | 161 |
| 3 | XP-005-55 | Skipfire | 2019-11-18 | pen | 7 | 29 | 203 |
| 4 | NB-917-18 | Bluezoom | 2019-04-18 | poster | 36 | 19 | 684 |
df.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 invoice 1000 non-null object
1 company 1000 non-null object
2 purchase_date 1000 non-null datetime64[ns]
3 product 1000 non-null object
4 quantity 1000 non-null int64
5 price 1000 non-null int64
6 extended amount 1000 non-null int64
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 54.8+ KB
df_levels = pd.read_excel(src_file,
sheet_name='levels')df_levels.head()| Compiled by hand | Unnamed: 1 | Unnamed: 2 | Unnamed: 3 | Unnamed: 4 | Unnamed: 5 | Unnamed: 6 | Unnamed: 7 | |
|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | New Levels from the CMO | NaN | NaN | NaN | NaN | NaN |
| 1 | Company Name | Notes | level | ZipCode | update_month | update_day | update_year | pct_to_target |
| 2 | Abatz | Not Sure about this one | diamond | 14150 | 3 | 1 | 2019 | 100% |
| 3 | Agivu | NaN | silver | 61354 | 3 | 1 | 2019 | 90% |
| 4 | Aibox | NaN | platinum | 04442 | 3 | 1 | 2019 | 105% |
df_levels = pd.read_excel(src_file,
sheet_name='levels',
header=2)df_levels.head()| Company Name | Notes | level | ZipCode | update_month | update_day | update_year | pct_to_target | |
|---|---|---|---|---|---|---|---|---|
| 0 | Abatz | Not Sure about this one | diamond | 14150 | 3 | 1 | 2019 | 100% |
| 1 | Agivu | NaN | silver | 61354 | 3 | 1 | 2019 | 90% |
| 2 | Aibox | NaN | platinum | 4442 | 3 | 1 | 2019 | 105% |
| 3 | Ailane | NaN | silver | 78595 | 3 | 1 | 2019 | 60% |
| 4 | Aimbo | NaN | diamond | 55447 | 3 | 1 | 2019 | 100% |
df_levels = pd.read_excel(src_file,
sheet_name='levels',
header=2,
usecols='A,C:H')df_levels.head()| Company Name | level | ZipCode | update_month | update_day | update_year | pct_to_target | |
|---|---|---|---|---|---|---|---|
| 0 | Abatz | diamond | 14150 | 3 | 1 | 2019 | 100% |
| 1 | Agivu | silver | 61354 | 3 | 1 | 2019 | 90% |
| 2 | Aibox | platinum | 4442 | 3 | 1 | 2019 | 105% |
| 3 | Ailane | silver | 78595 | 3 | 1 | 2019 | 60% |
| 4 | Aimbo | diamond | 55447 | 3 | 1 | 2019 | 100% |
Working with data types
df_levels.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Company Name 351 non-null object
1 level 351 non-null object
2 ZipCode 351 non-null int64
3 update_month 351 non-null int64
4 update_day 351 non-null int64
5 update_year 351 non-null int64
6 pct_to_target 351 non-null object
dtypes: int64(4), object(3)
memory usage: 19.3+ KB
df_levels = pd.read_excel(src_file,
sheet_name='levels',
header=2,
usecols='A,C:H',
parse_dates={'update_date': [3,4,5]})df_levels.head()| update_date | Company Name | level | ZipCode | pct_to_target | |
|---|---|---|---|---|---|
| 0 | 2019-03-01 | Abatz | diamond | 14150 | 100% |
| 1 | 2019-03-01 | Agivu | silver | 61354 | 90% |
| 2 | 2019-03-01 | Aibox | platinum | 4442 | 105% |
| 3 | 2019-03-01 | Ailane | silver | 78595 | 60% |
| 4 | 2019-03-01 | Aimbo | diamond | 55447 | 100% |
df_levels.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 update_date 351 non-null datetime64[ns]
1 Company Name 351 non-null object
2 level 351 non-null object
3 ZipCode 351 non-null int64
4 pct_to_target 351 non-null object
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 13.8+ KB
df_levels.head() # индекс пятизначный| update_date | Company Name | level | ZipCode | pct_to_target | |
|---|---|---|---|---|---|
| 0 | 2019-03-01 | Abatz | diamond | 14150 | 100% |
| 1 | 2019-03-01 | Agivu | silver | 61354 | 90% |
| 2 | 2019-03-01 | Aibox | platinum | 4442 | 105% |
| 3 | 2019-03-01 | Ailane | silver | 78595 | 60% |
| 4 | 2019-03-01 | Aimbo | diamond | 55447 | 100% |
df_levels.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 update_date 351 non-null datetime64[ns]
1 Company Name 351 non-null object
2 level 351 non-null object
3 ZipCode 351 non-null int64
4 pct_to_target 351 non-null object
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 13.8+ KB
df_levels = pd.read_excel(src_file,
sheet_name='levels',
header=2,
usecols='A,C:H',
parse_dates={'update_date': [3,4,5]},
dtype={'ZipCode': 'object'})df_levels.head()| update_date | Company Name | level | ZipCode | pct_to_target | |
|---|---|---|---|---|---|
| 0 | 2019-03-01 | Abatz | diamond | 14150 | 100% |
| 1 | 2019-03-01 | Agivu | silver | 61354 | 90% |
| 2 | 2019-03-01 | Aibox | platinum | 04442 | 105% |
| 3 | 2019-03-01 | Ailane | silver | 78595 | 60% |
| 4 | 2019-03-01 | Aimbo | diamond | 55447 | 100% |
df_levels.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 update_date 351 non-null datetime64[ns]
1 Company Name 351 non-null object
2 level 351 non-null object
3 ZipCode 351 non-null object
4 pct_to_target 351 non-null object
dtypes: datetime64[ns](1), object(4)
memory usage: 13.8+ KB
def convert_percent(val):
new_val = val.replace('%', '')
return float(new_val)/100convert_percent("87%")0.87
df_levels['pct_to_target']0 100%
1 90%
2 105%
3 60%
4 100%
...
346 105%
347 60%
348 100%
349 80%
350 105%
Name: pct_to_target, Length: 351, dtype: object
df_levels['pct_to_target'].apply(convert_percent)0 1.00
1 0.90
2 1.05
3 0.60
4 1.00
...
346 1.05
347 0.60
348 1.00
349 0.80
350 1.05
Name: pct_to_target, Length: 351, dtype: float64
df_levels = pd.read_excel(src_file,
sheet_name='levels',
header=2,
usecols='A,C:H',
parse_dates={'update_date': [3,4,5]},
dtype={'ZipCode': 'object'},
converters={'pct_to_target': convert_percent})df_levels.head()| update_date | Company Name | level | ZipCode | pct_to_target | |
|---|---|---|---|---|---|
| 0 | 2019-03-01 | Abatz | diamond | 14150 | 1.00 |
| 1 | 2019-03-01 | Agivu | silver | 61354 | 0.90 |
| 2 | 2019-03-01 | Aibox | platinum | 04442 | 1.05 |
| 3 | 2019-03-01 | Ailane | silver | 78595 | 0.60 |
| 4 | 2019-03-01 | Aimbo | diamond | 55447 | 1.00 |
df_levels.info()<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 update_date 351 non-null datetime64[ns]
1 Company Name 351 non-null object
2 level 351 non-null object
3 ZipCode 351 non-null object
4 pct_to_target 351 non-null float64
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 13.8+ KB