{ "cells": [ { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "import os\n", "import pandas as pd\n", "import numpy as np\n", "# import matplotlib.pyplot as plt\n", "# import seaborn as sns\n", "\n", "data_dir = './data'" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eidfirstlastmailseniority
0A0001TomLeetom.lee@mail.com7
1A0003JohnHuangjohn.huang@mail.com7
2A0030JennieZhanghennie.zhang@mail.cim3
3A0070AlexHuangalex.huang@mail.com1
\n", "
" ], "text/plain": [ " eid first last mail seniority\n", "0 A0001 Tom Lee tom.lee@mail.com 7\n", "1 A0003 John Huang john.huang@mail.com 7\n", "2 A0030 Jennie Zhang hennie.zhang@mail.cim 3\n", "3 A0070 Alex Huang alex.huang@mail.com 1" ] }, "execution_count": 19, "metadata": { }, "output_type": "execute_result" } ], "source": [ "people = {\n", " 'eid': ['A0001', 'A0003', 'A0030', 'A0070'],\n", " 'first': ['Tom', 'John', 'Jennie', 'Alex'],\n", " 'last': ['Lee', 'Huang', 'Zhang', 'Huang'],\n", " 'mail': ['tom.lee@mail.com', 'john.huang@mail.com', 'hennie.zhang@mail.cim', 'alex.huang@mail.com'],\n", " 'seniority': [7, 7, 3, 1]\n", "}\n", "\n", "df = pd.DataFrame(people)\n", "df" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4 entries, 0 to 3\n", "Data columns (total 5 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 eid 4 non-null object\n", " 1 first 4 non-null object\n", " 2 last 4 non-null object\n", " 3 mail 4 non-null object\n", " 4 seniority 4 non-null int64 \n", "dtypes: int64(1), object(4)\n", "memory usage: 288.0+ bytes\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
seniority
count4.0
mean4.5
std3.0
min1.0
25%2.5
50%5.0
75%7.0
max7.0
\n", "
" ], "text/plain": [ " seniority\n", "count 4.0\n", "mean 4.5\n", "std 3.0\n", "min 1.0\n", "25% 2.5\n", "50% 5.0\n", "75% 7.0\n", "max 7.0" ] }, "execution_count": 21, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "(pandas.core.series.Series, pandas.core.series.Series)" ] }, "execution_count": 22, "metadata": { }, "output_type": "execute_result" } ], "source": [ "type(df['eid']) , type(df.iloc[0])" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eidfirstlastmailseniority
2A0030JennieZhanghennie.zhang@mail.cim3
3A0070AlexHuangalex.huang@mail.com1
\n", "
" ], "text/plain": [ " eid first last mail seniority\n", "2 A0030 Jennie Zhang hennie.zhang@mail.cim 3\n", "3 A0070 Alex Huang alex.huang@mail.com 1" ] }, "execution_count": 23, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.loc[df.seniority <= 4]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
firstlastmailseniority
eid
A0001TomLeetom.lee@mail.com7
A0003JohnHuangjohn.huang@mail.com7
A0030JennieZhanghennie.zhang@mail.cim3
A0070AlexHuangalex.huang@mail.com1
\n", "
" ], "text/plain": [ " first last mail seniority\n", "eid \n", "A0001 Tom Lee tom.lee@mail.com 7\n", "A0003 John Huang john.huang@mail.com 7\n", "A0030 Jennie Zhang hennie.zhang@mail.cim 3\n", "A0070 Alex Huang alex.huang@mail.com 1" ] }, "execution_count": 24, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.set_index('eid') # don't forget inplace" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eidfirstlastmailseniority
2A0030JennieZhanghennie.zhang@mail.cim3
3A0070AlexHuangalex.huang@mail.com1
\n", "
" ], "text/plain": [ " eid first last mail seniority\n", "2 A0030 Jennie Zhang hennie.zhang@mail.cim 3\n", "3 A0070 Alex Huang alex.huang@mail.com 1" ] }, "execution_count": 25, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.loc[df.seniority < 5]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "'tom.lee@mail.com'" ] }, "execution_count": 26, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.iloc[0, 3]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "del df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NameCarColorPriceMPG
0JerryHondared800014
1JohnHondared750014
2NancyHondablue1100016
3GregFordgray700016
4BrianFordwhite850015
5JenFordblack1250020
6SandyFordblack1250020
7GabrielFordblack1395020
8WillToyotagreen30010
9LauraToyotared100012
10MelBMWblack5000035
11SteveVolvoblue100015
12ChrisVolvoblue25015
13SandraVolvobrown10012
\n", "
" ], "text/plain": [ " Name Car Color Price MPG\n", "0 Jerry Honda red 8000 14\n", "1 John Honda red 7500 14\n", "2 Nancy Honda blue 11000 16\n", "3 Greg Ford gray 7000 16\n", "4 Brian Ford white 8500 15\n", "5 Jen Ford black 12500 20\n", "6 Sandy Ford black 12500 20\n", "7 Gabriel Ford black 13950 20\n", "8 Will Toyota green 300 10\n", "9 Laura Toyota red 1000 12\n", "10 Mel BMW black 50000 35\n", "11 Steve Volvo blue 1000 15\n", "12 Chris Volvo blue 250 15\n", "13 Sandra Volvo brown 100 12" ] }, "execution_count": 8, "metadata": { }, "output_type": "execute_result" } ], "source": [ "data_source = os.path.join(data_dir, 'car_data.csv')\n", "df = pd.read_csv(data_source)\n", "df" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FileNameDateSegFilebeLBELBACFMUC...CDEADDELDFSSUSPCLASSNSP
1Variab10.txt1996-12-01CTG0001.txt240.0357.0120.0120.00.00.00.0...0.00.00.00.00.00.01.00.09.02.0
2Fmcs_1.txt1996-05-03CTG0002.txt5.0632.0132.0132.04.00.04.0...0.00.00.01.00.00.00.00.06.01.0
3Fmcs_1.txt1996-05-03CTG0003.txt177.0779.0133.0133.02.00.05.0...0.00.00.01.00.00.00.00.06.01.0
4Fmcs_1.txt1996-05-03CTG0004.txt411.01192.0134.0134.02.00.06.0...0.00.00.01.00.00.00.00.06.01.0
5Fmcs_1.txt1996-05-03CTG0005.txt533.01147.0132.0132.04.00.05.0...0.00.00.00.00.00.00.00.02.01.0
\n", "

5 rows × 40 columns

\n", "
" ], "text/plain": [ " FileName Date SegFile b e LBE LB AC \\\n", "1 Variab10.txt 1996-12-01 CTG0001.txt 240.0 357.0 120.0 120.0 0.0 \n", "2 Fmcs_1.txt 1996-05-03 CTG0002.txt 5.0 632.0 132.0 132.0 4.0 \n", "3 Fmcs_1.txt 1996-05-03 CTG0003.txt 177.0 779.0 133.0 133.0 2.0 \n", "4 Fmcs_1.txt 1996-05-03 CTG0004.txt 411.0 1192.0 134.0 134.0 2.0 \n", "5 Fmcs_1.txt 1996-05-03 CTG0005.txt 533.0 1147.0 132.0 132.0 4.0 \n", "\n", " FM UC ... C D E AD DE LD FS SUSP CLASS NSP \n", "1 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 9.0 2.0 \n", "2 0.0 4.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0 \n", "3 0.0 5.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0 \n", "4 0.0 6.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 6.0 1.0 \n", "5 0.0 5.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 1.0 \n", "\n", "[5 rows x 40 columns]" ] }, "execution_count": 13, "metadata": { }, "output_type": "execute_result" } ], "source": [ "xls_source = 'data/CTG.xls'\n", "df = pd.read_excel(xls_source, sheet_name='Raw Data')\n", "df.dropna(inplace=True)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ "df.to_excel(os.path.join(data_dir, 'wq_0121_exercise.xls'))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 14.1 s, sys: 69.3 ms, total: 14.1 s\n", "Wall time: 14.7 s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
order_idorder_dateship_dateship_modecustomer_namesegmentstatecountrymarketregion...categorysub_categoryproduct_namesalesquantitydiscountprofitshipping_costorder_priorityyear
51285CA-2014-1154272014-12-312015-01-04Standard ClassErica BernCorporateCaliforniaUnited StatesUSWest...Office SuppliesBindersCardinal Slant-D Ring Binder, Heavy Gauge Vinyl13.90420.24.51880.890Medium2014
51286MO-2014-25602014-12-312015-01-05Standard ClassLiz PreisConsumerSouss-Massa-DraâMoroccoAfricaAfrica...Office SuppliesBindersWilson Jones Hole Reinforcements, Clear3.99010.00.42000.490Medium2014
51287MX-2014-1105272014-12-312015-01-02Second ClassCharlotte MeltonConsumerManaguaNicaraguaLATAMCentral...Office SuppliesLabelsHon Color Coded Labels, 5000 Label Set26.40030.012.36000.350Medium2014
51288MX-2014-1147832014-12-312015-01-06Standard ClassTamara DahlenConsumerChihuahuaMexicoLATAMNorth...Office SuppliesLabelsHon Legal Exhibit Labels, Alphabetical7.12010.00.56000.199Medium2014
51289CA-2014-1567202014-12-312015-01-04Standard ClassJill MatthiasConsumerColoradoUnited StatesUSWest...Office SuppliesFastenersBagged Rubber Bands3.02430.2-0.60480.170Medium2014
\n", "

5 rows × 21 columns

\n", "
" ], "text/plain": [ " order_id order_date ship_date ship_mode customer_name \\\n", "51285 CA-2014-115427 2014-12-31 2015-01-04 Standard Class Erica Bern \n", "51286 MO-2014-2560 2014-12-31 2015-01-05 Standard Class Liz Preis \n", "51287 MX-2014-110527 2014-12-31 2015-01-02 Second Class Charlotte Melton \n", "51288 MX-2014-114783 2014-12-31 2015-01-06 Standard Class Tamara Dahlen \n", "51289 CA-2014-156720 2014-12-31 2015-01-04 Standard Class Jill Matthias \n", "\n", " segment state country market region ... \\\n", "51285 Corporate California United States US West ... \n", "51286 Consumer Souss-Massa-Draâ Morocco Africa Africa ... \n", "51287 Consumer Managua Nicaragua LATAM Central ... \n", "51288 Consumer Chihuahua Mexico LATAM North ... \n", "51289 Consumer Colorado United States US West ... \n", "\n", " category sub_category \\\n", "51285 Office Supplies Binders \n", "51286 Office Supplies Binders \n", "51287 Office Supplies Labels \n", "51288 Office Supplies Labels \n", "51289 Office Supplies Fasteners \n", "\n", " product_name sales quantity \\\n", "51285 Cardinal Slant-D Ring Binder, Heavy Gauge Vinyl 13.904 2 \n", "51286 Wilson Jones Hole Reinforcements, Clear 3.990 1 \n", "51287 Hon Color Coded Labels, 5000 Label Set 26.400 3 \n", "51288 Hon Legal Exhibit Labels, Alphabetical 7.120 1 \n", "51289 Bagged Rubber Bands 3.024 3 \n", "\n", " discount profit shipping_cost order_priority year \n", "51285 0.2 4.5188 0.890 Medium 2014 \n", "51286 0.0 0.4200 0.490 Medium 2014 \n", "51287 0.0 12.3600 0.350 Medium 2014 \n", "51288 0.0 0.5600 0.199 Medium 2014 \n", "51289 0.2 -0.6048 0.170 Medium 2014 \n", "\n", "[5 rows x 21 columns]" ] }, "execution_count": 15, "metadata": { }, "output_type": "execute_result" } ], "source": [ "%%time\n", "data_source = 'data/superstore_sales.xlsx'\n", "df = pd.read_excel(data_source, sheet_name='Orders')\n", "# df = pd.read_excel(data_source, sheet_name='Orders', engine='openpyxl')\n", "df.tail(5)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
salesquantitydiscountprofitshipping_costyear
count51290.00000051290.00000051290.00000051290.00000051290.00000051290.000000
mean246.4905813.4765450.14290828.64174026.3758182012.777208
std487.5653612.2787660.212280174.42411357.2968101.098931
min0.4440001.0000000.000000-6599.9780000.0020002011.000000
25%30.7586252.0000000.0000000.0000002.6100002012.000000
50%85.0530003.0000000.0000009.2400007.7900002013.000000
75%251.0532005.0000000.20000036.81000024.4500002014.000000
max22638.48000014.0000000.8500008399.976000933.5700002014.000000
\n", "
" ], "text/plain": [ " sales quantity discount profit shipping_cost \\\n", "count 51290.000000 51290.000000 51290.000000 51290.000000 51290.000000 \n", "mean 246.490581 3.476545 0.142908 28.641740 26.375818 \n", "std 487.565361 2.278766 0.212280 174.424113 57.296810 \n", "min 0.444000 1.000000 0.000000 -6599.978000 0.002000 \n", "25% 30.758625 2.000000 0.000000 0.000000 2.610000 \n", "50% 85.053000 3.000000 0.000000 9.240000 7.790000 \n", "75% 251.053200 5.000000 0.200000 36.810000 24.450000 \n", "max 22638.480000 14.000000 0.850000 8399.976000 933.570000 \n", "\n", " year \n", "count 51290.000000 \n", "mean 2012.777208 \n", "std 1.098931 \n", "min 2011.000000 \n", "25% 2012.000000 \n", "50% 2013.000000 \n", "75% 2014.000000 \n", "max 2014.000000 " ] }, "execution_count": 16, "metadata": { }, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { "collapsed": false }, "outputs": [ ], "source": [ ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (system-wide)", "language": "python", "metadata": { "cocalc": { "description": "Python 3 programming language", "priority": 100, "url": "https://www.python.org/" } }, "name": "python3", "resource_dir": "/ext/jupyter/kernels/python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }