Чтение файлов

import pandas as pd
src_file = "https://github.com/dm-fedorov/pandas_basic/blob/master/%D1%83%D1%80%D0%BE%D0%BA%D0%B8_pandas/04-data-input-output/data/raw/excel_complicated_document.xlsx?raw=True"
df = pd.read_excel(src_file)
#!pip uninstall openpyxl
#!pip install openpyxl
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Empty DataFrame
df = pd.read_excel(src_file,
                   sheet_name='sales data')
df.head()
invoice company purchase_date product quantity price extended amount Average Price 22.816
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323 Standard Deviation 7.533269
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420 NaN NaN
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161 Total Qty Sold NaN
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203 shirt 6249.000000
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684 book 5340.000000
df = pd.read_excel(src_file,
                   sheet_name='sales data',
                   usecols='A:G')
df.head()
invoice company purchase_date product quantity price extended amount
0 ZN-870-29 Realcube 2019-03-05 shirt 19 17 323
1 JQ-501-63 Zooxo 2019-07-09 book 30 14 420
2 FI-165-58 Dabtype 2019-08-12 poster 7 23 161
3 XP-005-55 Skipfire 2019-11-18 pen 7 29 203
4 NB-917-18 Bluezoom 2019-04-18 poster 36 19 684
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   invoice          1000 non-null   object        
 1   company          1000 non-null   object        
 2   purchase_date    1000 non-null   datetime64[ns]
 3   product          1000 non-null   object        
 4   quantity         1000 non-null   int64         
 5   price            1000 non-null   int64         
 6   extended amount  1000 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 54.8+ KB
df_levels = pd.read_excel(src_file,
                          sheet_name='levels')
df_levels.head()
Compiled by hand Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7
0 NaN NaN New Levels from the CMO NaN NaN NaN NaN NaN
1 Company Name Notes level ZipCode update_month update_day update_year pct_to_target
2 Abatz Not Sure about this one diamond 14150 3 1 2019 100%
3 Agivu NaN silver 61354 3 1 2019 90%
4 Aibox NaN platinum 04442 3 1 2019 105%
df_levels = pd.read_excel(src_file,
                          sheet_name='levels',
                          header=2)
df_levels.head()
Company Name Notes level ZipCode update_month update_day update_year pct_to_target
0 Abatz Not Sure about this one diamond 14150 3 1 2019 100%
1 Agivu NaN silver 61354 3 1 2019 90%
2 Aibox NaN platinum 4442 3 1 2019 105%
3 Ailane NaN silver 78595 3 1 2019 60%
4 Aimbo NaN diamond 55447 3 1 2019 100%
df_levels = pd.read_excel(src_file,
                          sheet_name='levels',
                          header=2,
                          usecols='A,C:H')
df_levels.head()
Company Name level ZipCode update_month update_day update_year pct_to_target
0 Abatz diamond 14150 3 1 2019 100%
1 Agivu silver 61354 3 1 2019 90%
2 Aibox platinum 4442 3 1 2019 105%
3 Ailane silver 78595 3 1 2019 60%
4 Aimbo diamond 55447 3 1 2019 100%

Working with data types

df_levels.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Company Name   351 non-null    object
 1   level          351 non-null    object
 2   ZipCode        351 non-null    int64 
 3   update_month   351 non-null    int64 
 4   update_day     351 non-null    int64 
 5   update_year    351 non-null    int64 
 6   pct_to_target  351 non-null    object
dtypes: int64(4), object(3)
memory usage: 19.3+ KB
df_levels = pd.read_excel(src_file,
                          sheet_name='levels',
                          header=2,
                          usecols='A,C:H',
                          parse_dates={'update_date': [3,4,5]})
df_levels.head()
update_date Company Name level ZipCode pct_to_target
0 2019-03-01 Abatz diamond 14150 100%
1 2019-03-01 Agivu silver 61354 90%
2 2019-03-01 Aibox platinum 4442 105%
3 2019-03-01 Ailane silver 78595 60%
4 2019-03-01 Aimbo diamond 55447 100%
df_levels.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   update_date    351 non-null    datetime64[ns]
 1   Company Name   351 non-null    object        
 2   level          351 non-null    object        
 3   ZipCode        351 non-null    int64         
 4   pct_to_target  351 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 13.8+ KB
df_levels.head() # индекс пятизначный
update_date Company Name level ZipCode pct_to_target
0 2019-03-01 Abatz diamond 14150 100%
1 2019-03-01 Agivu silver 61354 90%
2 2019-03-01 Aibox platinum 4442 105%
3 2019-03-01 Ailane silver 78595 60%
4 2019-03-01 Aimbo diamond 55447 100%
df_levels.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   update_date    351 non-null    datetime64[ns]
 1   Company Name   351 non-null    object        
 2   level          351 non-null    object        
 3   ZipCode        351 non-null    int64         
 4   pct_to_target  351 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 13.8+ KB
df_levels = pd.read_excel(src_file,
                          sheet_name='levels',
                          header=2,
                          usecols='A,C:H',
                          parse_dates={'update_date': [3,4,5]},
                          dtype={'ZipCode': 'object'})
df_levels.head()
update_date Company Name level ZipCode pct_to_target
0 2019-03-01 Abatz diamond 14150 100%
1 2019-03-01 Agivu silver 61354 90%
2 2019-03-01 Aibox platinum 04442 105%
3 2019-03-01 Ailane silver 78595 60%
4 2019-03-01 Aimbo diamond 55447 100%
df_levels.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   update_date    351 non-null    datetime64[ns]
 1   Company Name   351 non-null    object        
 2   level          351 non-null    object        
 3   ZipCode        351 non-null    object        
 4   pct_to_target  351 non-null    object        
dtypes: datetime64[ns](1), object(4)
memory usage: 13.8+ KB
def convert_percent(val):
    new_val = val.replace('%', '')
    return float(new_val)/100
convert_percent("87%")
0.87
df_levels['pct_to_target']
0      100%
1       90%
2      105%
3       60%
4      100%
       ... 
346    105%
347     60%
348    100%
349     80%
350    105%
Name: pct_to_target, Length: 351, dtype: object
df_levels['pct_to_target'].apply(convert_percent)
0      1.00
1      0.90
2      1.05
3      0.60
4      1.00
       ... 
346    1.05
347    0.60
348    1.00
349    0.80
350    1.05
Name: pct_to_target, Length: 351, dtype: float64
df_levels = pd.read_excel(src_file,
                          sheet_name='levels',
                          header=2,
                          usecols='A,C:H',
                          parse_dates={'update_date': [3,4,5]},
                          dtype={'ZipCode': 'object'},
                          converters={'pct_to_target': convert_percent})
df_levels.head()
update_date Company Name level ZipCode pct_to_target
0 2019-03-01 Abatz diamond 14150 1.00
1 2019-03-01 Agivu silver 61354 0.90
2 2019-03-01 Aibox platinum 04442 1.05
3 2019-03-01 Ailane silver 78595 0.60
4 2019-03-01 Aimbo diamond 55447 1.00
df_levels.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 351 entries, 0 to 350
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   update_date    351 non-null    datetime64[ns]
 1   Company Name   351 non-null    object        
 2   level          351 non-null    object        
 3   ZipCode        351 non-null    object        
 4   pct_to_target  351 non-null    float64       
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 13.8+ KB