CoCalc Public Files01_read_data_source.ipynbOpen with one click!
Authors: Roger Chen, Anonymous User-1608862054
Views : 37
Compute Environment: Ubuntu 20.04 (Default)
In [18]:
import os import pandas as pd import numpy as np # import matplotlib.pyplot as plt # import seaborn as sns data_dir = './data'
In [19]:
people = { 'eid': ['A0001', 'A0003', 'A0030', 'A0070'], 'first': ['Tom', 'John', 'Jennie', 'Alex'], 'last': ['Lee', 'Huang', 'Zhang', 'Huang'], 'mail': ['[email protected]', '[email protected]', '[email protected]', '[email protected]'], 'seniority': [7, 7, 3, 1] } df = pd.DataFrame(people) df
eid first last mail seniority
0 A0001 Tom Lee [email protected] 7
1 A0003 John Huang [email protected] 7
2 A0030 Jennie Zhang [email protected] 3
3 A0070 Alex Huang [email protected] 1
In [20]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4 entries, 0 to 3 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 eid 4 non-null object 1 first 4 non-null object 2 last 4 non-null object 3 mail 4 non-null object 4 seniority 4 non-null int64 dtypes: int64(1), object(4) memory usage: 288.0+ bytes
In [21]:
df.describe()
seniority
count 4.0
mean 4.5
std 3.0
min 1.0
25% 2.5
50% 5.0
75% 7.0
max 7.0
In [22]:
type(df['eid']) , type(df.iloc[0])
(pandas.core.series.Series, pandas.core.series.Series)
In [23]:
df.loc[df.seniority <= 4]
eid first last mail seniority
2 A0030 Jennie Zhang [email protected] 3
3 A0070 Alex Huang [email protected] 1
In [24]:
df.set_index('eid') # don't forget inplace
first last mail seniority
eid
A0001 Tom Lee [email protected] 7
A0003 John Huang [email protected] 7
A0030 Jennie Zhang [email protected] 3
A0070 Alex Huang [email protected] 1
In [ ]:
df
In [25]:
df.loc[df.seniority < 5]
eid first last mail seniority
2 A0030 Jennie Zhang [email protected] 3
3 A0070 Alex Huang [email protected] 1
In [26]:
df.iloc[0, 3]
In [25]:
del df
In [8]:
data_source = os.path.join(data_dir, 'car_data.csv') df = pd.read_csv(data_source) df
Name Car Color Price MPG
0 Jerry Honda red 8000 14
1 John Honda red 7500 14
2 Nancy Honda blue 11000 16
3 Greg Ford gray 7000 16
4 Brian Ford white 8500 15
5 Jen Ford black 12500 20
6 Sandy Ford black 12500 20
7 Gabriel Ford black 13950 20
8 Will Toyota green 300 10
9 Laura Toyota red 1000 12
10 Mel BMW black 50000 35
11 Steve Volvo blue 1000 15
12 Chris Volvo blue 250 15
13 Sandra Volvo brown 100 12
In [13]:
xls_source = 'data/CTG.xls' df = pd.read_excel(xls_source, sheet_name='Raw Data') df.dropna(inplace=True) df.head()
FileName Date SegFile b e LBE LB AC FM UC ... C D E AD DE LD FS SUSP CLASS NSP
1 Variab10.txt 1996-12-01 CTG0001.txt 240.0 357.0 120.0 120.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 9.0 2.0
2 Fmcs_1.txt 1996-05-03 CTG0002.txt 5.0 632.0 132.0 132.0 4.0 0.0 4.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0
3 Fmcs_1.txt 1996-05-03 CTG0003.txt 177.0 779.0 133.0 133.0 2.0 0.0 5.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0
4 Fmcs_1.txt 1996-05-03 CTG0004.txt 411.0 1192.0 134.0 134.0 2.0 0.0 6.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0
5 Fmcs_1.txt 1996-05-03 CTG0005.txt 533.0 1147.0 132.0 132.0 4.0 0.0 5.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0

5 rows × 40 columns

In [14]:
df.to_excel(os.path.join(data_dir, 'wq_0121_exercise.xls'))
In [15]:
%%time data_source = 'data/superstore_sales.xlsx' df = pd.read_excel(data_source, sheet_name='Orders') # df = pd.read_excel(data_source, sheet_name='Orders', engine='openpyxl') df.tail(5)
CPU times: user 14.1 s, sys: 69.3 ms, total: 14.1 s Wall time: 14.7 s
order_id order_date ship_date ship_mode customer_name segment state country market region ... category sub_category product_name sales quantity discount profit shipping_cost order_priority year
51285 CA-2014-115427 2014-12-31 2015-01-04 Standard Class Erica Bern Corporate California United States US West ... Office Supplies Binders Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl 13.904 2 0.2 4.5188 0.890 Medium 2014
51286 MO-2014-2560 2014-12-31 2015-01-05 Standard Class Liz Preis Consumer Souss-Massa-Draâ Morocco Africa Africa ... Office Supplies Binders Wilson Jones Hole Reinforcements, Clear 3.990 1 0.0 0.4200 0.490 Medium 2014
51287 MX-2014-110527 2014-12-31 2015-01-02 Second Class Charlotte Melton Consumer Managua Nicaragua LATAM Central ... Office Supplies Labels Hon Color Coded Labels, 5000 Label Set 26.400 3 0.0 12.3600 0.350 Medium 2014
51288 MX-2014-114783 2014-12-31 2015-01-06 Standard Class Tamara Dahlen Consumer Chihuahua Mexico LATAM North ... Office Supplies Labels Hon Legal Exhibit Labels, Alphabetical 7.120 1 0.0 0.5600 0.199 Medium 2014
51289 CA-2014-156720 2014-12-31 2015-01-04 Standard Class Jill Matthias Consumer Colorado United States US West ... Office Supplies Fasteners Bagged Rubber Bands 3.024 3 0.2 -0.6048 0.170 Medium 2014

5 rows × 21 columns

In [16]:
df.describe()
sales quantity discount profit shipping_cost year
count 51290.000000 51290.000000 51290.000000 51290.000000 51290.000000 51290.000000
mean 246.490581 3.476545 0.142908 28.641740 26.375818 2012.777208
std 487.565361 2.278766 0.212280 174.424113 57.296810 1.098931
min 0.444000 1.000000 0.000000 -6599.978000 0.002000 2011.000000
25% 30.758625 2.000000 0.000000 0.000000 2.610000 2012.000000
50% 85.053000 3.000000 0.000000 9.240000 7.790000 2013.000000
75% 251.053200 5.000000 0.200000 36.810000 24.450000 2014.000000
max 22638.480000 14.000000 0.850000 8399.976000 933.570000 2014.000000
In [ ]: