Jupyter notebook CDS-102/Lab Week 05 - Introduction to Data Analysis using R/CDS-102 Lab Week 05 Workbook.ipynb

Project: Bassil Alomari - Introduction to Computational and Data Sciences (Spring 2017)

Path: CDS-102/Lab Week 05 - Introduction to Data Analysis using R/CDS-102 Lab Week 05 Workbook.ipynb

Views: ⁴⁷

Kernel: R (SageMath)

CDS-102: Lab 5 Workbook

Name: Bassil Alomari

March 2, 2017

In [2]:

# Run this code block to load the Tidyverse package
.libPaths(new = "~/Rlibs")
library(tidyverse)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats

In [3]:

office_data <- read_csv('OfficeSupplies.csv')

Parsed with column specification:
cols(
  OrderDate = col_character(),
  Region = col_character(),
  Rep = col_character(),
  Item = col_character(),
  Units = col_integer(),
  `Unit Price` = col_double()
)

In [18]:

print(office_data)

# A tibble: 43 × 6
     OrderDate  Region     Rep    Item Units `Unit Price`
         <chr>   <chr>   <chr>   <chr> <int>        <dbl>
 4-Jul-2014    East Richard Pen Set    62         4.99
12-Jul-2014    East    Nick  Binder    29         1.99
21-Jul-2014 Central  Morgan Pen Set    55        12.49
29-Jul-2014    East   Susan  Binder    81        19.99
 7-Aug-2014 Central Matthew Pen Set    42        23.95
15-Aug-2014    East Richard  Pencil    35         4.99
24-Aug-2014    West   James    Desk     3       275.00
 1-Sep-2014 Central   Smith    Desk     2       125.00
10-Sep-2014 Central    Bill  Pencil     7         1.29
18-Sep-2014    East Richard Pen Set    16        15.99
# ... with 33 more rows

In [19]:

Office_data <- rownames_to_column(as_tibble(office_data),var = "orderdate")
print(office_data)

# A tibble: 43 × 6
     OrderDate  Region     Rep    Item Units `Unit Price`
         <chr>   <chr>   <chr>   <chr> <int>        <dbl>
 4-Jul-2014    East Richard Pen Set    62         4.99
12-Jul-2014    East    Nick  Binder    29         1.99
21-Jul-2014 Central  Morgan Pen Set    55        12.49
29-Jul-2014    East   Susan  Binder    81        19.99
 7-Aug-2014 Central Matthew Pen Set    42        23.95
15-Aug-2014    East Richard  Pencil    35         4.99
24-Aug-2014    West   James    Desk     3       275.00
 1-Sep-2014 Central   Smith    Desk     2       125.00
10-Sep-2014 Central    Bill  Pencil     7         1.29
18-Sep-2014    East Richard Pen Set    16        15.99
# ... with 33 more rows

In [42]:

print(select(office_data, Region, Rep, Item))

# A tibble: 43 × 3
    Region     Rep    Item
     <chr>   <chr>   <chr>
   East Richard Pen Set
   East    Nick  Binder
Central  Morgan Pen Set
   East   Susan  Binder
Central Matthew Pen Set
   East Richard  Pencil
   West   James    Desk
Central   Smith    Desk
Central    Bill  Pencil
  East Richard Pen Set
# ... with 33 more rows

In [20]:

(select(office_data, Region, Rep, Item))
print(distinct(Region, Rep, Item))

   Region  Rep     Item   
East    Richard Pen Set
East    Nick    Binder 
Central Morgan  Pen Set
East    Susan   Binder 
Central Matthew Pen Set
East    Richard Pencil 
West    James   Desk   
Central Smith   Desk   
Central Bill    Pencil 
East    Richard Pen Set
West    James   Pen    
Central Morgan  Binder 
West    Thomas  Binder 
East    Richard Pen    
Central Rachel  Pencil 
East    Susan   Pen    
Central Alex    Binder 
Central Matthew Pen Set
Central Alex    Binder 
Central Smith   Pencil 
Central Rachel  Binder 
East    Susan   Pen Set
East    Richard Pencil 
Central Bill    Binder 
Central Matthew Binder 
Central Smith   Binder 
Central Alex    Pencil 
East    Richard Binder 
Central Bill    Pen    
West    James   Binder 
West    James   Pencil 
Central Alex    Pen Set
East    Richard Binder 
Central Rachel  Pencil 
Central Rachel  Pencil 
East    Nick    Pen    
Central Alex    Pencil 
Central Bill    Pencil 
West    Thomas  Pencil 
Central Bill    Binder 
East    Richard Binder 
Central Matthew Desk   
Central Morgan  Pencil 

Error in distinct_(.data, .dots = lazyeval::lazy_dots(...), .keep_all = .keep_all): object 'Region' not found
Traceback:
1. print(distinct(Region, Rep, Item))
2. distinct(Region, Rep, Item)
3. distinct_(.data, .dots = lazyeval::lazy_dots(...), .keep_all = .keep_all)

In [21]:

office_data.mutated <- mutate(office_data, TotalCost = Units * `Unit Price`)
print(select(office_data.mutated,OrderDate, Units,`Unit Price`, TotalCost))

# A tibble: 43 × 4
     OrderDate Units `Unit Price` TotalCost
         <chr> <int>        <dbl>     <dbl>
 4-Jul-2014    62         4.99    309.38
12-Jul-2014    29         1.99     57.71
21-Jul-2014    55        12.49    686.95
29-Jul-2014    81        19.99   1619.19
 7-Aug-2014    42        23.95   1005.90
15-Aug-2014    35         4.99    174.65
24-Aug-2014     3       275.00    825.00
 1-Sep-2014     2       125.00    250.00
10-Sep-2014     7         1.29      9.03
18-Sep-2014    16        15.99    255.84
# ... with 33 more rows

In [22]:

by_Region <- group_by(office_data, Region)
print(summarise(by_Region,sum(Units), sum(`Unit Price`)))

# A tibble: 3 × 3
   Region `sum(Units)` `sum(\\`Unit Price\\`)`
    <chr>        <int>                   <dbl>
1 Central         1199                  432.45
2    East          691                  118.87
3    West          231                  321.95

In [115]:

Office_data.filtered <- filter(by_Region, Region == "Central")
print(Office_data.filtered)

Source: local data frame [24 x 6]
Groups: Region [1]

     OrderDate  Region     Rep    Item Units `Unit Price`
         <chr>   <chr>   <chr>   <chr> <int>        <dbl>
1  21-Jul-2014 Central  Morgan Pen Set    55        12.49
2   7-Aug-2014 Central Matthew Pen Set    42        23.95
3   1-Sep-2014 Central   Smith    Desk     2       125.00
4  10-Sep-2014 Central    Bill  Pencil     7         1.29
5   5-Oct-2014 Central  Morgan  Binder    28         8.99
6  31-Oct-2014 Central  Rachel  Pencil    14         1.29
7  17-Nov-2014 Central    Alex  Binder    11         4.99
8  25-Nov-2014 Central Matthew Pen Set    96         4.99
9   4-Dec-2014 Central    Alex  Binder    94        19.99
10 12-Dec-2014 Central   Smith  Pencil    67         1.29
# ... with 14 more rows

In [127]:

by_Region <- group_by(office_data, Region, Rep, Units, `Unit Price` )
print(summarise(by_Region))

Source: local data frame [43 x 4]
Groups: Region, Rep, Units [?]

    Region   Rep Units `Unit Price`
     <chr> <chr> <int>        <dbl>
1  Central  Alex    11         4.99
2  Central  Alex    36         4.99
3  Central  Alex    50         4.99
4  Central  Alex    90         4.99
5  Central  Alex    94        19.99
6  Central  Bill     7         1.29
7  Central  Bill    27        19.99
8  Central  Bill    46         8.99
9  Central  Bill    53         1.29
10 Central  Bill    80         8.99
# ... with 33 more rows

In [20]:

by_Region <- group_by(office_data, Rep , Units, `Unit Price`)
print(summarise(by_Region))

Source: local data frame [43 x 3]
Groups: Rep, Units [?]

     Rep Units `Unit Price`
   <chr> <int>        <dbl>
1   Alex    11         4.99
2   Alex    36         4.99
3   Alex    50         4.99
4   Alex    90         4.99
5   Alex    94        19.99
6   Bill     7         1.29
7   Bill    27        19.99
8   Bill    46         8.99
9   Bill    53         1.29
10  Bill    80         8.99
# ... with 33 more rows

In [21]:

Office_data.filtered <- filter(by_Region, Rep == "Alex")
print(Office_data.filtered)

Source: local data frame [5 x 6]
Groups: Rep, Units, Unit Price [5]

    OrderDate  Region   Rep    Item Units `Unit Price`
        <chr>   <chr> <chr>   <chr> <int>        <dbl>
1 17-Nov-2014 Central  Alex  Binder    11         4.99
2  4-Dec-2014 Central  Alex  Binder    94        19.99
3  9-Feb-2015 Central  Alex  Pencil    36         4.99
4 24-Mar-2015 Central  Alex Pen Set    50         4.99
5  5-May-2015 Central  Alex  Pencil    90         4.99

In [30]:

Office_data.filtered <- filter(by_Region, Rep == "Bill")
print(Office_data.filtered)

Source: local data frame [5 x 6]
Groups: Rep, Units, Unit Price [5]

    OrderDate  Region   Rep   Item Units `Unit Price`
        <chr>   <chr> <chr>  <chr> <int>        <dbl>
1 10-Sep-2014 Central  Bill Pencil     7         1.29
2 15-Jan-2015 Central  Bill Binder    46         8.99
3 26-Feb-2015 Central  Bill    Pen    27        19.99
4 14-May-2015 Central  Bill Pencil    53         1.29
5 31-May-2015 Central  Bill Binder    80         8.99

In [4]:

office_data.mutated <- mutate(office_data, TotalCost = `Unit Price` > 20)
print(office_data.mutated, TotalCost)

# A tibble: 43 × 7
     OrderDate  Region     Rep    Item Units `Unit Price` TotalCost
         <chr>   <chr>   <chr>   <chr> <int>        <dbl>     <lgl>
 4-Jul-2014    East Richard Pen Set    62         4.99     FALSE
12-Jul-2014    East    Nick  Binder    29         1.99     FALSE
21-Jul-2014 Central  Morgan Pen Set    55        12.49     FALSE
29-Jul-2014    East   Susan  Binder    81        19.99     FALSE
 7-Aug-2014 Central Matthew Pen Set    42        23.95      TRUE
15-Aug-2014    East Richard  Pencil    35         4.99     FALSE
24-Aug-2014    West   James    Desk     3       275.00      TRUE
 1-Sep-2014 Central   Smith    Desk     2       125.00      TRUE
10-Sep-2014 Central    Bill  Pencil     7         1.29     FALSE
18-Sep-2014    East Richard Pen Set    16        15.99     FALSE
# ... with 33 more rows

In [ ]:

In [ ]:

In [ ]:

CDS-102: Lab 5 Workbook

Name: Bassil Alomari

March 2, 2017

Product

Resources

Company