Contact
CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
| Download

Jupyter notebook CDS-102/Lab Week 07 - Tidying your dataset/CDS-102 Lab Week 07 Workbook.ipynb

Views: 58
Kernel: R (SageMath)

CDS-102: Lab 7 Workbook

Name:

March 9, 2017

# Run this code block to load the Tidyverse package
# The user-level library directory is put on the library search path before
# attaching the package (presumably where tidyverse is installed -- confirm).
.libPaths(new = "~/Rlibs")
library(tidyverse)
Loading tidyverse: ggplot2 Loading tidyverse: tibble Loading tidyverse: tidyr Loading tidyverse: readr Loading tidyverse: purrr Loading tidyverse: dplyr Conflicts with tidy packages --------------------------------------------------- filter(): dplyr, stats lag(): dplyr, stats
# Import the dataset
# read_csv() guesses each column's type and prints the specification it used.
original_data <- read_csv("Brauer2008_Dataset1.csv")
Parsed with column specification: cols( .default = col_double(), GID = col_character(), YORF = col_character(), NAME = col_character(), GWEIGHT = col_integer() ) See spec(...) for full column specifications.
# Display the imported table to inspect its columns before tidying.
original_data
GID YORF 1 GENE1331X A_06_P5820 2 GENE4924X A_06_P5866 3 GENE4690X A_06_P1834 4 GENE1177X A_06_P4928 5 GENE511X A_06_P5620 6 GENE2133X A_06_P5307 7 GENE1002X A_06_P6258 8 GENE5478X A_06_P7082 9 GENE2065X A_06_P2554 10 GENE2440X A_06_P6431 11 GENE4180X A_06_P6220 12 GENE5247X A_06_P1410 13 GENE2121X A_06_P2983 14 GENE1985X A_06_P3720 15 GENE4728X A_06_P2774 16 GENE3153X A_06_P4597 17 GENE3704X A_06_P5667 18 GENE2141X A_06_P3260 19 GENE2978X A_06_P3607 20 GENE1203X A_06_P5929 21 GENE3214X A_06_P6219 22 GENE443X A_06_P1322 23 GENE1570X A_06_P6449 24 GENE4434X A_06_P2356 25 GENE2486X A_06_P6921 26 GENE2099X A_06_P1729 27 GENE5137X A_06_P2688 28 GENE2691X A_06_P1007 29 GENE2673X A_06_P1933 30 GENE3094X A_06_P1548 ⋮ ⋮ ⋮ 5508 GENE5335X A_06_P5817 5509 GENE3931X A_06_P4860 5510 GENE2273X A_06_P3220 5511 GENE1180X A_06_P6178 5512 GENE4771X A_06_P2485 5513 GENE321X A_06_P7110 5514 GENE236X A_06_P6294 5515 GENE2516X A_06_P2042 5516 GENE1687X A_06_P4130 5517 GENE5522X A_06_P4129 5518 GENE2461X A_06_P1902 5519 GENE5154X A_06_P3834 5520 GENE2896X A_06_P5553 5521 GENE4037X A_06_P4180 5522 GENE674X A_06_P7057 5523 GENE3957X A_06_P6469 5524 GENE2250X A_06_P7164 5525 GENE785X A_06_P7198 5526 GENE4483X A_06_P1283 5527 GENE491X A_06_P3540 5528 GENE4050X A_06_P2650 5529 GENE17X A_06_P6055 5530 GENE4426X A_06_P6690 5531 GENE1274X A_06_P6825 5532 GENE410X A_06_P4625 5533 GENE2833X A_06_P6094 5534 GENE271X A_06_P3243 5535 GENE1691X A_06_P4196 5536 GENE1755X A_06_P4680 5537 GENE4255X A_06_P6304 NAME 1 SFB2 || ER to Golgi transport || molecular function unknown || YNL049C || 1082129 2 || biological process unknown || molecular function unknown || YNL095C || 1086222 3 QRI7 || proteolysis and peptidolysis || metalloendopeptidase activity || YDL104C || 1085955 4 CFT2 || mRNA polyadenylylation* || RNA binding || YLR115W || 1081958 5 SSO2 || vesicle fusion* || t-SNARE activity || YMR183C || 1081214 6 PSP2 || biological process unknown || molecular function unknown || YML017W || 1083036 7 RIB2 || 
riboflavin biosynthesis || pseudouridylate synthase activity* || YOL066C || 1081766 8 VMA13 || vacuolar acidification || hydrogen-transporting ATPase activity, rotational mechanism || YPR036W || 1086860 9 EDC3 || deadenylylation-independent decapping || molecular function unknown || YEL015W || 1082963 10 VPS5 || protein retention in Golgi* || protein transporter activity || YOR069W || 1083389 11 || biological process unknown || molecular function unknown || YOL029C || 1085380 12 AMN1 || negative regulation of exit from mitosis* || protein binding || YBR158W || 1086594 13 SCW11 || cytokinesis, completion of separation || glucan 1,3-beta-glucosidase activity || YGL028C || 1083024 14 DSE2 || cell wall organization and biogenesis* || glucan 1,3-beta-glucosidase activity || YHR143W || 1082870 15 COX15 || cytochrome c oxidase complex assembly* || oxidoreductase activity, acting on NADH or NADPH, heme protein as acceptor || YER141W || 1085995 16 SPE1 || pantothenate biosynthesis* || ornithine decarboxylase activity || YKL184W || 1084207 17 MTF1 || transcription from mitochondrial promoter || S-adenosylmethionine-dependent methyltransferase activity* || YMR228W || 1084832 18 KSS1 || invasive growth (sensu Saccharomyces)* || MAP kinase activity || YGR040W || 1083046 19 || biological process unknown || molecular function unknown || YHR036W || 1084002 20 || biological process unknown || molecular function unknown || YNL158W || 1081987 21 YAP7 || positive regulation of transcription from RNA polymerase II promoter || RNA polymerase II transcription factor activity || YOL028C || 1084281 22 || proteolysis and peptidolysis || metalloendopeptidase activity || YBR074W || 1081132 23 YVC1 || cation homeostasis || calcium channel activity* || YOR087W || 1082401 24 CDC40 || nuclear mRNA splicing, via spliceosome* || RNA splicing factor activity, transesterification mechanism* || YDR364C || 1085655 25 || biological process unknown || molecular function unknown || YPL162C || 1083440 26 
RMD1 || biological process unknown || molecular function unknown || YDL001W || 1083001 27 PCL6 || regulation of glycogen biosynthesis* || cyclin-dependent protein kinase regulator activity || YER059W || 1086466 28 AI4 || RNA splicing* || endonuclease activity || Q0065 || 1083679 29 GGC1 || mitochondrial genome maintenance* || guanine nucleotide transporter activity || YDL198C || 1083659 30 SUL1 || sulfate transport || sulfate transporter activity || YBR294W || 1084134 ⋮ ⋮ 5508 || biological process unknown || molecular function unknown || YNL046W || 1086698 5509 RPS0B || protein biosynthesis* || structural constituent of ribosome || YLR048W || 1085091 5510 COS12 || biological process unknown || molecular function unknown || YGL263W || 1083201 5511 || biological process unknown || molecular function unknown || YNR065C || 1081961 5512 IZH1 || lipid metabolism* || metal ion binding || YDR492W || 1086044 5513 || || || YPR064W || 1080987 5514 IZH4 || lipid metabolism* || metal ion binding || YOL101C || 1080893 5515 PST1 || cell wall organization and biogenesis || molecular function unknown || YDR055W || 1083475 5516 PRM10 || conjugation with cellular fusion || molecular function unknown || YJL108C || 1082535 5517 || biological process unknown || molecular function unknown || YJL107C || 1086909 5518 SFA1 || formaldehyde catabolism || alcohol dehydrogenase activity* || YDL168W || 1083412 5519 CAP2 || filamentous growth* || actin filament binding || YIL034C || 1086485 5520 || biological process unknown || molecular function unknown || YMR122W-A || 1083907 5521 CIS3 || cell wall organization and biogenesis || structural constituent of cell wall || YJL158C || 1085216 5522 || || || YPR012W || 1081403 5523 RGS2 || G-protein signaling, coupled to cAMP nucleotide second messenger || GTPase activator activity || YOR107W || 1085121 5524 || biological process unknown || molecular function unknown || YPR117W || 1083173 5525 || || || YPR150W || 1081527 5526 CSG2 || calcium ion 
homeostasis* || enzyme regulator activity || YBR036C || 1085710 5527 SPO11 || meiotic DNA double-strand break formation || endodeoxyribonuclease activity, producing 3'-phosphomonoesters || YHL022C || 1081188 5528 CHO1 || phosphatidylserine biosynthesis || CDP-diacylglycerol-serine O-phosphatidyltransferase activity || YER026C || 1085231 5529 WSC2 || cell wall organization and biogenesis* || transmembrane receptor activity || YNL283C || 1080641 5530 MYO2 || endocytosis* || microfilament motor activity || YOR326W || 1085645 5531 || biological process unknown || molecular function unknown || YPL066W || 1082066 5532 DOA1 || ubiquitin-dependent protein catabolism* || molecular function unknown || YKL213C || 1081094 5533 KRE1 || cell wall organization and biogenesis || structural constituent of cell wall || YNL322C || 1083836 5534 MTL1 || cell wall organization and biogenesis || molecular function unknown || YGR023W || 1080930 5535 KRE9 || cell wall organization and biogenesis* || molecular function unknown || YJL174W || 1082539 5536 UTH1 || mitochondrion organization and biogenesis* || molecular function unknown || YKR042W || 1082610 5537 || biological process unknown || molecular function unknown || YOL111C || 1085465 GWEIGHT G0.05 G0.1 G0.15 G0.2 G0.25 G0.3 ⋯ L0.15 L0.2 L0.25 L0.3 1 1 -0.24 -0.13 -0.21 -0.15 -0.05 -0.05 ⋯ 0.13 0.20 0.17 0.11 2 1 0.28 0.13 -0.40 -0.48 -0.11 0.17 ⋯ 0.02 0.04 0.03 0.01 3 1 -0.02 -0.27 -0.27 -0.02 0.24 0.25 ⋯ -0.07 -0.05 -0.13 -0.04 4 1 -0.33 -0.41 -0.24 -0.03 -0.03 0.00 ⋯ -0.05 0.02 0.00 0.08 5 1 0.05 0.02 0.40 0.34 -0.13 -0.14 ⋯ 0.00 -0.11 0.04 0.01 6 1 -0.69 -0.03 0.23 0.20 0.00 -0.27 ⋯ 0.25 -0.21 0.12 -0.11 7 1 -0.55 -0.30 -0.12 -0.03 -0.16 -0.11 ⋯ 0.27 0.24 0.05 0.19 8 1 -0.75 -0.12 -0.07 0.02 -0.32 -0.41 ⋯ 0.15 0.15 0.00 0.03 9 1 -0.24 -0.22 0.14 0.06 0.00 -0.13 ⋯ 0.17 0.07 0.10 0.11 10 1 -0.16 -0.38 0.05 0.14 -0.04 -0.01 ⋯ 0.11 0.00 0.02 0.09 11 1 -0.22 -0.18 0.27 0.18 0.03 -0.04 ⋯ -0.04 -0.13 -0.08 0.10 12 1 0.18 0.61 1.55 1.34 
0.23 -0.03 ⋯ -0.35 -0.27 -0.07 -0.11 13 1 -0.67 -0.47 1.16 1.05 -0.18 -0.68 ⋯ -0.11 0.01 -0.27 -0.51 14 1 -0.59 -0.17 1.17 0.85 -0.12 -0.61 ⋯ -0.39 -0.42 -0.48 -0.65 15 1 -0.28 -0.81 -0.39 0.24 0.01 0.01 ⋯ -0.18 -0.02 0.15 -0.18 16 1 -0.19 0.24 0.03 0.17 0.00 -0.01 ⋯ 0.04 -0.07 0.06 -0.20 17 1 -0.42 -0.43 -0.36 -0.12 0.05 0.24 ⋯ -0.07 -0.14 -0.03 -0.04 18 1 -0.76 -0.32 -0.05 -0.27 -0.31 -0.01 ⋯ 0.15 0.06 0.20 -0.11 19 1 -0.91 -0.43 -0.05 -0.09 -0.27 -0.45 ⋯ -0.08 -0.12 -0.13 -0.05 20 1 -0.47 -0.43 -0.15 0.08 -0.26 -0.25 ⋯ 0.02 -0.17 -0.30 -0.41 21 1 -0.51 -0.04 0.06 0.26 -0.19 -0.22 ⋯ -0.16 -0.13 -0.13 -0.03 22 1 -1.01 -0.55 -0.72 -0.54 -0.55 -0.19 ⋯ -0.08 -0.25 0.12 -0.09 23 1 -0.40 -0.14 -0.06 0.00 -0.22 -0.07 ⋯ -0.10 -0.15 0.03 0.09 24 1 -0.19 -0.08 -0.16 -0.10 -0.12 -0.10 ⋯ -0.38 -0.19 -0.01 0.01 25 1 -0.10 -0.02 -0.37 -0.09 -0.14 0.10 ⋯ -0.22 -0.04 -0.10 0.05 26 1 -0.22 -0.03 -0.26 -0.19 -0.10 0.15 ⋯ -0.21 -0.16 -0.10 -0.03 27 1 -0.25 -0.35 0.04 0.15 0.02 0.09 ⋯ 0.12 0.02 0.04 0.13 28 1 -0.36 -0.39 0.42 -0.09 0.00 0.12 ⋯ 0.49 0.22 -0.09 -0.80 29 1 0.76 0.33 -0.21 -0.16 0.00 0.09 ⋯ -0.15 -0.08 -0.72 -1.25 30 1 -0.32 -0.54 -0.37 -0.64 -0.09 -0.10 ⋯ -0.56 -0.77 -0.95 -1.32 ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋱ ⋮ ⋮ ⋮ ⋮ 5508 1 0.09 0.23 0.13 0.27 0.11 -0.08 ⋯ 0.02 0.25 0.10 -0.03 5509 1 -0.12 -0.42 -0.12 0.09 0.01 0.08 ⋯ 0.24 0.32 0.16 0.12 5510 1 -1.30 0.09 0.29 -0.03 -0.14 -0.20 ⋯ 0.77 1.07 0.95 1.17 5511 1 -0.16 -0.02 0.15 0.17 -0.21 -0.12 ⋯ 0.13 -0.06 -0.21 -0.11 5512 1 -0.10 -0.13 0.28 0.46 0.23 -0.25 ⋯ -0.47 -0.62 -0.60 -0.51 5513 1 0.13 0.17 0.53 1.21 0.38 -0.02 ⋯ -0.28 -0.11 -0.31 -0.37 5514 1 0.64 0.44 1.31 1.35 0.97 0.26 ⋯ -0.13 -0.73 -0.46 -0.42 5515 1 0.62 0.25 1.12 1.05 0.13 -0.12 ⋯ 0.28 0.15 -0.05 -0.29 5516 1 0.63 0.59 0.73 0.65 0.23 0.29 ⋯ -0.24 -0.40 -0.42 -0.08 5517 1 0.65 0.62 0.71 0.71 0.04 0.10 ⋯ -0.29 -0.56 -0.73 -0.21 5518 1 0.81 0.30 0.16 0.36 0.38 0.37 ⋯ -0.07 -0.01 -0.09 0.01 5519 1 0.10 -0.33 0.05 0.25 0.09 -0.05 ⋯ 0.24 0.12 -0.01 -0.05 5520 1 0.01 -0.22 
0.67 0.72 0.36 0.04 ⋯ -0.19 0.10 0.02 0.05 5521 1 -0.01 -0.21 0.20 0.60 0.48 0.18 ⋯ -0.02 -0.06 0.07 0.03 5522 1 -0.23 0.05 0.25 0.28 0.22 0.27 ⋯ -0.14 0.02 0.13 0.04 5523 1 0.00 0.01 -0.13 0.02 -0.07 -0.32 ⋯ -0.09 -0.32 -0.30 -0.29 5524 1 -0.22 -0.51 0.28 0.32 0.00 -0.38 ⋯ -0.49 -0.57 -0.53 -0.48 5525 1 -0.10 -0.24 -0.06 -0.04 -0.37 -0.67 ⋯ -0.21 -0.16 -0.17 -0.52 5526 1 0.06 -0.22 -0.09 0.15 -0.12 -0.27 ⋯ 0.15 0.16 0.03 -0.15 5527 1 -0.83 0.21 0.04 -0.19 -0.41 -0.15 ⋯ 0.04 0.27 -0.17 -0.23 5528 1 -0.74 -0.63 -0.07 0.11 -0.18 -0.31 ⋯ 0.26 0.11 0.04 -0.16 5529 1 -0.62 -0.26 -0.19 -0.03 -0.09 -0.27 ⋯ -0.33 -0.21 -0.16 -0.20 5530 1 -0.67 -0.38 -0.12 -0.05 -0.15 -0.24 ⋯ -0.12 -0.24 -0.15 -0.24 5531 1 -0.14 0.06 0.78 0.81 0.30 0.14 ⋯ 0.28 0.13 0.30 0.06 5532 1 0.12 -0.07 0.14 0.29 0.17 0.09 ⋯ 0.19 0.17 0.03 0.02 5533 1 0.41 -0.28 0.30 0.50 -0.05 -0.08 ⋯ 0.38 0.23 0.21 0.15 5534 1 0.50 -0.12 0.25 0.24 0.13 0.02 ⋯ 0.25 -0.02 -0.06 -0.10 5535 1 0.15 0.09 0.21 0.46 0.19 -0.02 ⋯ 0.37 0.21 0.16 -0.01 5536 1 0.63 0.38 0.05 0.12 0.13 -0.01 ⋯ -0.07 0.02 0.24 0.18 5537 1 0.18 0.05 0.11 0.09 -0.02 0.03 ⋯ -0.07 -0.08 -0.02 -0.06 U0.05 U0.1 U0.15 U0.2 U0.25 U0.3 1 -0.06 -0.26 -0.05 -0.28 -0.19 0.09 2 -1.02 -0.91 -0.59 -0.61 -0.17 0.18 3 -0.91 -0.94 -0.42 -0.36 -0.49 -0.47 4 -0.53 -0.51 -0.26 0.05 -0.14 -0.01 5 -0.45 -0.09 -0.13 0.02 -0.09 -0.03 6 NA -0.65 0.09 0.06 -0.07 -0.10 7 0.07 -0.31 -0.08 0.12 0.05 0.06 8 -0.40 -0.02 0.26 0.31 0.14 0.11 9 0.01 -0.16 0.07 0.20 0.02 0.10 10 -0.26 -0.13 -0.10 0.07 -0.04 -0.12 11 -0.02 0.04 0.16 0.02 -0.03 -0.22 12 -1.15 0.41 0.28 0.00 0.17 -0.01 13 -1.48 -0.43 -0.27 -0.32 -0.24 -0.15 14 -1.24 0.41 0.18 0.09 0.13 -0.04 15 -1.91 -0.31 0.09 -0.24 -0.03 0.19 16 -1.53 -0.43 -0.46 -0.73 -0.48 -0.25 17 -0.62 -0.53 -0.30 -0.17 -0.44 -0.35 18 -0.80 -0.18 -0.05 -0.26 -0.58 -0.18 19 -1.04 -0.59 -0.47 -0.29 -0.33 -0.20 20 -0.67 -0.01 -0.20 -0.36 -0.30 -0.04 21 -1.09 -0.26 0.02 -0.09 -0.43 -0.21 22 NA -0.23 -0.32 -0.49 0.01 0.24 23 NA -0.39 -0.30 -0.28 
-0.01 0.14 24 -1.07 -0.01 0.05 -0.10 -0.04 0.12 25 -1.19 0.15 -0.09 -0.17 0.05 0.12 26 -0.76 -0.03 -0.13 -0.20 -0.13 -0.03 27 -1.10 -0.19 -0.12 0.08 0.00 0.06 28 -1.59 -1.24 -0.65 -0.62 -0.61 -0.81 29 -2.31 -1.70 -1.37 -1.24 -0.73 -0.83 30 -5.55 -4.59 -3.34 -1.98 -3.09 -1.79 ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ ⋮ 5508 -0.13 0.56 0.39 0.38 0.40 0.36 5509 0.32 0.41 0.03 -0.12 0.16 0.30 5510 0.51 0.50 1.03 1.64 1.27 1.33 5511 0.22 0.51 0.59 0.14 0.38 0.34 5512 0.39 0.35 0.31 -0.07 0.56 0.44 5513 1.21 1.54 0.93 0.50 0.52 0.27 5514 3.36 2.87 2.25 2.00 2.32 1.15 5515 1.95 2.00 1.40 0.50 0.81 0.64 5516 0.54 1.26 0.77 0.61 0.82 0.54 5517 NA 1.06 0.58 0.39 0.53 0.48 5518 0.12 0.53 0.41 0.02 0.40 0.32 5519 0.09 0.76 0.35 0.04 0.20 0.14 5520 -0.06 0.87 0.88 0.28 0.49 0.49 5521 0.49 1.47 1.13 0.04 0.69 0.32 5522 0.32 0.14 -0.01 0.17 0.02 0.27 5523 0.45 0.24 0.18 -0.24 -0.26 -0.08 5524 0.79 1.04 0.32 -0.14 0.29 -0.16 5525 -0.13 0.67 0.39 -0.11 0.33 0.09 5526 0.11 0.26 -0.04 -0.24 -0.05 -0.01 5527 NA 0.07 0.33 0.19 0.01 -0.34 5528 -0.28 0.81 -0.10 -0.03 -0.13 -0.20 5529 0.67 0.78 0.08 -0.34 -0.09 -0.21 5530 0.11 0.69 0.36 -0.12 0.08 -0.04 5531 -0.75 1.91 0.47 0.25 0.37 0.15 5532 0.16 0.28 0.36 0.22 0.29 0.14 5533 0.32 0.62 0.54 0.01 0.56 0.28 5534 NA 0.50 0.29 -0.14 0.47 0.27 5535 -0.68 0.63 0.41 0.09 0.48 0.43 5536 -0.89 0.19 0.03 0.04 0.13 0.19 5537 0.03 0.14 0.00 -0.21 0.07 0.04
# Tidy the wide table into long form: one row per gene and sample column.

# Step 1: split the "||"-delimited NAME column into its five fields.
cleaning_data_step1 <- separate(
  original_data,
  NAME,
  c("name", "BP", "MF", "systematic_name", "number"),
  sep = "\\|\\|"
)

# Step 2: strip the whitespace left around each field by the split.
# mutate_at() + vars() replaces the deprecated mutate_each() + funs().
cleaning_data_step2 <- mutate_at(
  cleaning_data_step1,
  vars(name:systematic_name),
  trimws
)

# Step 3: drop the columns that are not used in the rest of the analysis.
cleaning_data_step3 <- select(
  cleaning_data_step2,
  -number, -GID, -YORF, -GWEIGHT
)

# Step 4: stack the sample columns (G0.05 through U0.3) into key/value pairs.
cleaning_data_step4 <- gather(
  cleaning_data_step3,
  sample, expression,
  G0.05:U0.3
)

# Step 5: split the sample label after its first character, e.g. "G0.05"
# becomes nutrient "G" and rate 0.05; convert = TRUE makes rate numeric.
cleaning_data_step5 <- separate(
  cleaning_data_step4,
  sample,
  c("nutrient", "rate"),
  sep = 1,
  convert = TRUE
)

glimpse(cleaning_data_step5)
Observations: 199,332 Variables: 7 $ name <chr> "SFB2", "", "QRI7", "CFT2", "SSO2", "PSP2", "RIB2",... $ BP <chr> "ER to Golgi transport", "biological process unknow... $ MF <chr> "molecular function unknown", "molecular function u... $ systematic_name <chr> "YNL049C", "YNL095C", "YDL104C", "YLR115W", "YMR183... $ nutrient <chr> "G", "G", "G", "G", "G", "G", "G", "G", "G", "G", "... $ rate <dbl> 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0... $ expression <dbl> -0.24, 0.28, -0.02, -0.33, 0.05, -0.69, -0.55, -0.7...
# Keep only the rows whose gene name is LEU1, then print the result.
data.filtered <- filter(cleaning_data_step5, name == "LEU1")
print(data.filtered)
# A tibble: 36 × 7 name BP MF <chr> <chr> <chr> 1 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 2 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 3 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 4 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 5 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 6 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 7 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 8 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 9 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity 10 LEU1 leucine biosynthesis 3-isopropylmalate dehydratase activity # ... with 26 more rows, and 4 more variables: systematic_name <chr>, # nutrient <chr>, rate <dbl>, expression <dbl>
# Line plot of expression against rate for the filtered gene, with one
# coloured line per nutrient.
ggplot(data.filtered) +
  geom_line(aes(x = rate, y = expression, colour = nutrient))
MIME type unknown not supported
Image in a Jupyter notebook