Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Jupyter notebook CDS-102/Lab Week 05 - Introduction to Data Analysis using R/CDS-102 Lab Week 05 Workbook.ipynb

Views: 47
Kernel: R (SageMath)

CDS-102: Lab 5 Workbook

Name: Bassil Alomari

March 2, 2017

# Run this code block to load the Tidyverse package.
# The statements must sit on their own lines; when fused onto the comment line
# they are treated as part of the comment and never execute.
# Make the user-level library directory available before attaching packages.
.libPaths(new = "~/Rlibs")
library(tidyverse)
Loading tidyverse: ggplot2 Loading tidyverse: tibble Loading tidyverse: tidyr Loading tidyverse: readr Loading tidyverse: purrr Loading tidyverse: dplyr Conflicts with tidy packages --------------------------------------------------- filter(): dplyr, stats lag(): dplyr, stats
# Import the office-supplies order records from the CSV file in the working
# directory; readr reports the column types it guessed for each column.
office_data <- read_csv(file = "OfficeSupplies.csv")
Parsed with column specification: cols( OrderDate = col_character(), Region = col_character(), Rep = col_character(), Item = col_character(), Units = col_integer(), `Unit Price` = col_double() )
# Show the first rows of the imported tibble.
office_data %>% print()
# A tibble: 43 × 6 OrderDate Region Rep Item Units `Unit Price` <chr> <chr> <chr> <chr> <int> <dbl> 1 4-Jul-2014 East Richard Pen Set 62 4.99 2 12-Jul-2014 East Nick Binder 29 1.99 3 21-Jul-2014 Central Morgan Pen Set 55 12.49 4 29-Jul-2014 East Susan Binder 81 19.99 5 7-Aug-2014 Central Matthew Pen Set 42 23.95 6 15-Aug-2014 East Richard Pencil 35 4.99 7 24-Aug-2014 West James Desk 3 275.00 8 1-Sep-2014 Central Smith Desk 2 125.00 9 10-Sep-2014 Central Bill Pencil 7 1.29 10 18-Sep-2014 East Richard Pen Set 16 15.99 # ... with 33 more rows
# Move the row names into an explicit "orderdate" column. The result is stored
# under a separate name (capital "O"), so `office_data` itself stays unchanged
# for the cells that follow.
Office_data <- rownames_to_column(as_tibble(office_data), var = "orderdate")
# Print the tibble that was just created. The original cell printed the
# unmodified `office_data`, which hid the effect of the call above; the
# recorded output beneath this cell still reflects that earlier behavior.
print(Office_data)
# A tibble: 43 × 6 OrderDate Region Rep Item Units `Unit Price` <chr> <chr> <chr> <chr> <int> <dbl> 1 4-Jul-2014 East Richard Pen Set 62 4.99 2 12-Jul-2014 East Nick Binder 29 1.99 3 21-Jul-2014 Central Morgan Pen Set 55 12.49 4 29-Jul-2014 East Susan Binder 81 19.99 5 7-Aug-2014 Central Matthew Pen Set 42 23.95 6 15-Aug-2014 East Richard Pencil 35 4.99 7 24-Aug-2014 West James Desk 3 275.00 8 1-Sep-2014 Central Smith Desk 2 125.00 9 10-Sep-2014 Central Bill Pencil 7 1.29 10 18-Sep-2014 East Richard Pen Set 16 15.99 # ... with 33 more rows
# Keep only the region, sales rep, and item columns and display them.
office_data %>%
  select(Region, Rep, Item) %>%
  print()
# A tibble: 43 × 3 Region Rep Item <chr> <chr> <chr> 1 East Richard Pen Set 2 East Nick Binder 3 Central Morgan Pen Set 4 East Susan Binder 5 Central Matthew Pen Set 6 East Richard Pencil 7 West James Desk 8 Central Smith Desk 9 Central Bill Pencil 10 East Richard Pen Set # ... with 33 more rows
# Display the three columns of interest for every order.
(select(office_data, Region, Rep, Item))
# List each unique Region/Rep/Item combination. distinct() needs the data frame
# as its first argument; the original call omitted it, which raised
# "object 'Region' not found".
print(distinct(office_data, Region, Rep, Item))
Region Rep Item 1 East Richard Pen Set 2 East Nick Binder 3 Central Morgan Pen Set 4 East Susan Binder 5 Central Matthew Pen Set 6 East Richard Pencil 7 West James Desk 8 Central Smith Desk 9 Central Bill Pencil 10 East Richard Pen Set 11 West James Pen 12 Central Morgan Binder 13 West Thomas Binder 14 East Richard Pen 15 Central Rachel Pencil 16 East Susan Pen 17 Central Alex Binder 18 Central Matthew Pen Set 19 Central Alex Binder 20 Central Smith Pencil 21 Central Rachel Binder 22 East Susan Pen Set 23 East Richard Pencil 24 Central Bill Binder 25 Central Matthew Binder 26 Central Smith Binder 27 Central Alex Pencil 28 East Richard Binder 29 Central Bill Pen 30 West James Binder 31 West James Pencil 32 Central Alex Pen Set 33 East Richard Binder 34 Central Rachel Pencil 35 Central Rachel Pencil 36 East Nick Pen 37 Central Alex Pencil 38 Central Bill Pencil 39 West Thomas Pencil 40 Central Bill Binder 41 East Richard Binder 42 Central Matthew Desk 43 Central Morgan Pencil
Error in distinct_(.data, .dots = lazyeval::lazy_dots(...), .keep_all = .keep_all): object 'Region' not found Traceback: 1. print(distinct(Region, Rep, Item)) 2. distinct(Region, Rep, Item) 3. distinct_(.data, .dots = lazyeval::lazy_dots(...), .keep_all = .keep_all)
# Add a TotalCost column (units multiplied by unit price) and show it next to
# the columns it was computed from.
office_data.mutated <- office_data %>%
  mutate(TotalCost = Units * `Unit Price`)
office_data.mutated %>%
  select(OrderDate, Units, `Unit Price`, TotalCost) %>%
  print()
# A tibble: 43 × 4 OrderDate Units `Unit Price` TotalCost <chr> <int> <dbl> <dbl> 1 4-Jul-2014 62 4.99 309.38 2 12-Jul-2014 29 1.99 57.71 3 21-Jul-2014 55 12.49 686.95 4 29-Jul-2014 81 19.99 1619.19 5 7-Aug-2014 42 23.95 1005.90 6 15-Aug-2014 35 4.99 174.65 7 24-Aug-2014 3 275.00 825.00 8 1-Sep-2014 2 125.00 250.00 9 10-Sep-2014 7 1.29 9.03 10 18-Sep-2014 16 15.99 255.84 # ... with 33 more rows
# Group the orders by region, then total the units and the unit prices
# within each region.
by_Region <- office_data %>% group_by(Region)
by_Region %>%
  summarise(sum(Units), sum(`Unit Price`)) %>%
  print()
# A tibble: 3 × 3 Region `sum(Units)` `sum(\\`Unit Price\\`)` <chr> <int> <dbl> 1 Central 1199 432.45 2 East 691 118.87 3 West 231 321.95
# Keep only the Central-region rows of the grouped data and display them.
Office_data.filtered <- by_Region %>% filter(Region == "Central")
Office_data.filtered %>% print()
Source: local data frame [24 x 6] Groups: Region [1] OrderDate Region Rep Item Units `Unit Price` <chr> <chr> <chr> <chr> <int> <dbl> 1 21-Jul-2014 Central Morgan Pen Set 55 12.49 2 7-Aug-2014 Central Matthew Pen Set 42 23.95 3 1-Sep-2014 Central Smith Desk 2 125.00 4 10-Sep-2014 Central Bill Pencil 7 1.29 5 5-Oct-2014 Central Morgan Binder 28 8.99 6 31-Oct-2014 Central Rachel Pencil 14 1.29 7 17-Nov-2014 Central Alex Binder 11 4.99 8 25-Nov-2014 Central Matthew Pen Set 96 4.99 9 4-Dec-2014 Central Alex Binder 94 19.99 10 12-Dec-2014 Central Smith Pencil 67 1.29 # ... with 14 more rows
# Regroup by region, rep, units, and unit price. Calling summarise() with no
# summary expressions yields one row per group.
by_Region <- office_data %>%
  group_by(Region, Rep, Units, `Unit Price`)
by_Region %>%
  summarise() %>%
  print()
Source: local data frame [43 x 4] Groups: Region, Rep, Units [?] Region Rep Units `Unit Price` <chr> <chr> <int> <dbl> 1 Central Alex 11 4.99 2 Central Alex 36 4.99 3 Central Alex 50 4.99 4 Central Alex 90 4.99 5 Central Alex 94 19.99 6 Central Bill 7 1.29 7 Central Bill 27 19.99 8 Central Bill 46 8.99 9 Central Bill 53 1.29 10 Central Bill 80 8.99 # ... with 33 more rows
# Regroup by rep, units, and unit price (region dropped). Calling summarise()
# with no summary expressions yields one row per group.
by_Region <- office_data %>%
  group_by(Rep, Units, `Unit Price`)
by_Region %>%
  summarise() %>%
  print()
Source: local data frame [43 x 3] Groups: Rep, Units [?] Rep Units `Unit Price` <chr> <int> <dbl> 1 Alex 11 4.99 2 Alex 36 4.99 3 Alex 50 4.99 4 Alex 90 4.99 5 Alex 94 19.99 6 Bill 7 1.29 7 Bill 27 19.99 8 Bill 46 8.99 9 Bill 53 1.29 10 Bill 80 8.99 # ... with 33 more rows
# Pull out the orders handled by the rep named Alex and display them.
Office_data.filtered <- by_Region %>% filter(Rep == "Alex")
Office_data.filtered %>% print()
Source: local data frame [5 x 6] Groups: Rep, Units, Unit Price [5] OrderDate Region Rep Item Units `Unit Price` <chr> <chr> <chr> <chr> <int> <dbl> 1 17-Nov-2014 Central Alex Binder 11 4.99 2 4-Dec-2014 Central Alex Binder 94 19.99 3 9-Feb-2015 Central Alex Pencil 36 4.99 4 24-Mar-2015 Central Alex Pen Set 50 4.99 5 5-May-2015 Central Alex Pencil 90 4.99
# Pull out the orders handled by the rep named Bill and display them.
Office_data.filtered <- by_Region %>% filter(Rep == "Bill")
Office_data.filtered %>% print()
Source: local data frame [5 x 6] Groups: Rep, Units, Unit Price [5] OrderDate Region Rep Item Units `Unit Price` <chr> <chr> <chr> <chr> <int> <dbl> 1 10-Sep-2014 Central Bill Pencil 7 1.29 2 15-Jan-2015 Central Bill Binder 46 8.99 3 26-Feb-2015 Central Bill Pen 27 19.99 4 14-May-2015 Central Bill Pencil 53 1.29 5 31-May-2015 Central Bill Binder 80 8.99
# Flag the orders whose unit price exceeds 20. Note that this overwrites any
# earlier `office_data.mutated`, and that TotalCost here holds a TRUE/FALSE
# flag rather than a monetary amount.
office_data.mutated <- mutate(office_data, TotalCost = `Unit Price` > 20)
# print() only needs the object to display. The stray second argument
# (`TotalCost`) in the original call did not select a column or change what
# was printed, so it has been dropped.
print(office_data.mutated)
# A tibble: 43 × 7 OrderDate Region Rep Item Units `Unit Price` TotalCost <chr> <chr> <chr> <chr> <int> <dbl> <lgl> 1 4-Jul-2014 East Richard Pen Set 62 4.99 FALSE 2 12-Jul-2014 East Nick Binder 29 1.99 FALSE 3 21-Jul-2014 Central Morgan Pen Set 55 12.49 FALSE 4 29-Jul-2014 East Susan Binder 81 19.99 FALSE 5 7-Aug-2014 Central Matthew Pen Set 42 23.95 TRUE 6 15-Aug-2014 East Richard Pencil 35 4.99 FALSE 7 24-Aug-2014 West James Desk 3 275.00 TRUE 8 1-Sep-2014 Central Smith Desk 2 125.00 TRUE 9 10-Sep-2014 Central Bill Pencil 7 1.29 FALSE 10 18-Sep-2014 East Richard Pen Set 16 15.99 FALSE # ... with 33 more rows