Data in R

Datatype


* integer
* double
* logical
* character

# typeof() reports the internal storage type of a value.
a <- "Hi"
typeof(a)
## [1] "character"
# A bare numeric literal is stored as a double, even without a decimal part.
a <- 1
typeof(a)
## [1] "double"
# The L suffix makes the literal an integer.
a <- 1L
typeof(a)
## [1] "integer"

Data Structure


- Vector
- Matrix
- Array
- Dataframe
- List
- Factor

Vector

# Vector:
# The colon operator builds an integer sequence.
v <- 1:6
typeof(v)
## [1] "integer"
# c() with plain numeric literals builds a double vector instead.
v1 <- c(1,2,3,4,5,6)
typeof(v1)
## [1] "double"
is.vector(v)
## [1] TRUE
class(v)
## [1] "integer"

Matrix

# Matrix
# Create 4x3 matrix, filled column by column (byrow=F).
# 1:5 supplies only 5 values for 12 cells, so the values are recycled and
# R warns that the data length does not fit the dimensions evenly.
m <- matrix(1:5,nrow=4,ncol=3,byrow=F)
## Warning in matrix(1:5, nrow = 4, ncol = 3, byrow = F): data length [5] is not a
## sub-multiple or multiple of the number of rows [4]
m
##      [,1] [,2] [,3]
## [1,]    1    5    4
## [2,]    2    1    5
## [3,]    3    2    1
## [4,]    4    3    2
# Arithmetic on vectors of unequal length recycles the shorter one the same way.
(1:10)+(2:5)
## Warning in (1:10) + (2:5): longer object length is not a multiple of shorter
## object length
##  [1]  3  5  7  9  7  9 11 13 11 13
# dim() returns the number of rows, then the number of columns.
dim(m)
## [1] 4 3
# Rows 1 to 3 of column 2; the result is a plain vector.
m[1:3,2]
## [1] 5 1 2
# Once rows are named they can be selected by name as well as by position.
rownames(m) <- c("A","B","C","D")
m
##   [,1] [,2] [,3]
## A    1    5    4
## B    2    1    5
## C    3    2    1
## D    4    3    2
m["A",]
## [1] 1 5 4
m[c("A","B"),]
##   [,1] [,2] [,3]
## A    1    5    4
## B    2    1    5
# Selecting by position gives the same rows as selecting by name above.
m[c(1,2),]
##   [,1] [,2] [,3]
## A    1    5    4
## B    2    1    5

Array

# Array
# A 5 x 5 x 3 array holds 75 cells, so supply exactly 75 values;
# array() drops any surplus data without a warning.
a <- array(1:75,dim=c(5,5,3))
a[2,2,2]
## [1] 32
# One vector of names per dimension: rows, columns, layers.
dimnames(a) <- list(c("A","B","C","D","E"),c("A","B","C","D","E"),c("A","B","C"))
# a
# Rows 3 and 5 of column 2 across every layer; the result is a 2 x 3 matrix.
a[c(3,5),2,]
##    A  B  C
## C  8 33 58
## E 10 35 60

Dataframe

# Dataframe
# Four unnamed columns: an integer sequence, strings, a factor, and strings
# with a missing value.
d <- data.frame(1:3,c("A","B","C"),factor(c("D","D","A")),c("F","G",NA))
#d
# Single brackets keep the data frame wrapper (a one-column data frame).
d[1]
##   X1.3
## 1    1
## 2    2
## 3    3
# Double brackets extract the column itself as a vector.
d[[1]]
## [1] 1 2 3
# Name every column: supplying fewer names than columns leaves the rest named NA.
colnames(d) <- c("F","G","H","I")
d
##   F G H    I
## 1 1 A D    F
## 2 2 B D    G
## 3 3 C A <NA>
# A logical vector keeps the rows flagged TRUE.
d[c(TRUE,FALSE,TRUE),]
##   F G H    I
## 1 1 A D    F
## 3 3 C A <NA>
# A comparison on a column builds the logical row filter.
d[d$F==2,]
##   F G H I
## 2 2 B D G

List

# List
# A list can hold elements of different types and lengths.
l <- list(1:5,c("A","B"),factor(c("A","A","B","C")))
l
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] "A" "B"
## 
## [[3]]
## [1] A A B C
## Levels: A B C
# Double brackets return the element itself.
l[[1]]
## [1] 1 2 3 4 5
# Chain a single-bracket index to reach inside the extracted element.
l[[2]][2]
## [1] "B"

Factor

# Factor
# Unordered: levels are the sorted unique values
factor(c("A+","A","A"))
## [1] A+ A  A 
## Levels: A A+
# Ordered: with no explicit levels, the sorted unique values define the order
# (sorting follows the current locale's collation, not raw ASCII codes)
factor(c("A+","A","A-"),ordered = TRUE)
## [1] A+ A  A-
## Levels: A < A- < A+
# Ordered: an explicit levels vector defines the order
factor(c("A+","A","A","A-"),ordered = TRUE,levels = c("A-","A","A+"))
## [1] A+ A  A  A-
## Levels: A- < A < A+

Control Structure

Variable Name

a <- 1
b <- "123"
ABC <- 2        # names are case-sensitive: ABC and abc are different variables
abc <- 234
sdf2 <- 23234
2432ds <- 342   # invalid: a name cannot start with a digit (the parse error reported below)
342 <- 344534   # invalid: a number literal cannot be assigned to
fhdus_ <- 3242
_fdusif <- 344  # invalid: a name cannot start with an underscore
fhus_dsaf <- 2453
for <- 3423     # invalid: for is a reserved word
## Error: <text>:6:5: unexpected symbol
## 5: sdf2 <- 23234
## 6: 2432ds
##        ^

Logical

# The comparison is FALSE, so only the else branch runs.
if (1 == 2) {
  print("NMSL")
} else {
  print("NMHL")
}
## [1] "NMHL"

Loop

While

# Count down from 10; the loop stops as soon as x reaches 0.
x <- 10
while (x > 0) {
  print(x)
  x <- x - 1  # step toward the exit condition
}
## [1] 10
## [1] 9
## [1] 8
## [1] 7
## [1] 6
## [1] 5
## [1] 4
## [1] 3
## [1] 2
## [1] 1

For

# for walks over the elements of a vector, binding each one to i in turn.
fdsfds <- c("a","b","c")
for(i in fdsfds){
  print(i)
}
## [1] "a"
## [1] "b"
## [1] "c"

Function

# Absolute value of a numeric input.
#
# parameter: the value to process; may be of any length.
# Returns abs(parameter) for numeric input (NA elements stay NA), and FALSE
# for anything that is not numeric.
absoluteVal <- function(parameter){
  # Non-numeric input is reported with FALSE rather than an error
  if(!is.numeric(parameter)){
    return(FALSE)
  }
  # abs() is vectorized, so vectors and NA do not break a scalar if() test
  abs(parameter)
}
# Piecewise adjustment of a number:
#   below 10     -> returned unchanged
#   10 up to 20  -> increased by 10
#   20 and above -> decreased by 10
myfunc <- function(input) {
  if (input < 10) {
    return(input)
  }
  # Past the first threshold, only the sign of the shift depends on the range.
  shift <- if (input < 20) 10 else -10
  input + shift
}

# Non-numeric input yields FALSE; numeric input yields its magnitude.
absoluteVal("Dgfdsg")
## [1] FALSE
absoluteVal(2)
## [1] 2
absoluteVal(-3)
## [1] 3

Plot & Graph

Import Dataframe

From csv

df <- read.csv("./sleep.csv")

From database

Graphs

Plot

Graph of points, relation between X and Y

# Scatter plot of sleep hours against age, drawn with open circles (pch = 1).
plot(
  df$age, df$sleep,
  main = "Age vs. hours of sleep",
  xlab = "Age",
  ylab = "Hours of sleep",
  pch = 1
)

Histogram

Distribution of certain continuous (numeric) column

hist(df$age)

Barplot

Discrete data / Categorical data distribution / count

barplot(table(df$standing),main = "123",col=c("red","green"))

Boxplot

summary(df$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   17.00   19.00   21.00   21.27   23.25   25.00
boxplot(df$age)

boxplot(df$sleep~df$gender,col=c("red","green","purple"))

Probability

Range

\(\mathbb{P}(x) \in [0,1]\)

Calculation

\(\mathbb{P}(x \cap y)\)
\(\mathbb{P}(x \cup y) = \mathbb{P}(x) + \mathbb{P}(y) - \mathbb{P}(x \cap y)\)

\(\mathbb{P}(x|y) = \frac{\mathbb{P}(x \cap y)}{\mathbb{P}(y)}\)

Property

Mutually Exclusive
\(\mathbb{P}(x \cap y) = 0\)

Independent
\(\mathbb{P}(x \cap y) = \mathbb{P}(x) \cdot \mathbb{P}(y)\)

Distribution

Functions

  • P.M.F. Probability mass function
  • C.D.F. Cumulative distribution function
  • P.D.F. Probability Density Function

R Functions

  • dxxx \(\rightarrow\) P.M.F or P.D.F
  • pxxx \(\rightarrow\) C.D.F
  • rxxx \(\rightarrow\) Random number generation (simulate the experiment)
  • qxxx \(\rightarrow\) Quantile function (inverse of pxxx)

Uniform Distribution

  • Constant probability
  • Continuous
    A bus arrives at a stop every 10 minutes. A student is equally likely to arrive at the stop at any time. How long will the student have to wait?

\(\mathbb{P}(X \leq x)\) \(\rightarrow\) C.D.F(x) \(\rightarrow\) punif(x,min,max)
Prob of waiting for 3 mins or less
\(\mathbb{P}(X \leq 3)\)

punif(3,0,10)
## [1] 0.3

Prob of waiting for 3-5 mins
\(\mathbb{P}(3 \leq X \leq 5) = \mathbb{P}(X \leq 5) - \mathbb{P}(X \leq 3)\)

punif(5,0,10) - punif(3,0,10)
## [1] 0.2
punif(0:10,0,10)
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0

Binomial Distribution

  • Constant number of trials
  • Two outcomes for every trial: Success or Fail
  • Constant probability between trials
  • Trials need to be independent

\(\mathbb{P}(X=x)\) \(\rightarrow\) P.M.F(x) \(\rightarrow\) dbinom(x,trials,probability of success)
\(\mathbb{P}(X \leq x)\) \(\rightarrow\) C.D.F(x) \(\rightarrow\) pbinom(x,trials,probability of success)

plot(dbinom(0:10,10,0.5))

Toss a coin 10 times, getting 5 heads

dbinom(5,10,0.5)
## [1] 0.2460938

Toss a coin 10 times, getting less than 5 heads \(\mathbb{P}(X < 5) = \mathbb{P}(X \leq 4)\) \(\rightarrow\) C.D.F(4) \(\rightarrow\) pbinom(4,n,p)

# Three equivalent ways to get P(X <= 4), i.e. fewer than 5 heads:
# 1) add up the P.M.F. over x = 0..4
sum(dbinom(0:4,10,0.5))
## [1] 0.3769531
# 2) evaluate the C.D.F. at 4
pbinom(4,10,0.5)
## [1] 0.3769531
# 3) running total of the P.M.F.; the vector starts at x = 0, so element 5 is x = 4
cumsum(dbinom(0:10,10,0.5))[5]
## [1] 0.3769531

\(\mathbb{P}(X \leq 4.1)\) \(\rightarrow\) C.D.F(4.1) \(\rightarrow\) pbinom(4.1,n,p)

pbinom(4.1,10,0.5)
## [1] 0.3769531
plot(pbinom(0:10,10,0.5),type="s")

Probability of getting 3-7 heads (inclusive)

# P(3 <= X <= 7) = P(X <= 7) - P(X <= 2): subtract the C.D.F. just below the lower bound
pbinom(7,10,0.5)-pbinom(2,10,0.5)
## [1] 0.890625
# Same result: subtracting P(X <= 3) also removes x = 3, so add P(X = 3) back
pbinom(7,10,0.5)-pbinom(3,10,0.5) + dbinom(3,10,0.5)
## [1] 0.890625

Normal Distribution

  • Continuous
  • 68% 95% 99.7%
  • \(\mathbb{P}(X=x)=0\) \(\rightarrow\) P.D.F(x) \(\rightarrow\) dnorm(x,mean,std)
  • \(\mathbb{P}(X \leq x)\) \(\rightarrow\) C.D.F(x) \(\rightarrow\) pnorm(x,mean,std)
plot(dnorm(1:20,10,2))

plot(pnorm(1:20,10,2),type="s")

# Upper tail: P(X > 12) for a normal distribution with mean 10 and sd 2
pnorm(12,10,2,lower.tail=FALSE)
## [1] 0.1586553
# Lower tail: P(X <= 8); it equals the value above because 8 and 12 are each
# one sd away from the mean and the distribution is symmetric
pnorm(8,10,2,lower.tail=TRUE)
## [1] 0.1586553
# qnorm() inverts pnorm(): it returns the x whose lower-tail probability is given
qnorm(0.1586553,10,2,lower.tail = TRUE)
## [1] 8

Check normality:

# Draw 300 values from a normal distribution with mean 10 and sd 2.
# No seed is set, so the sample (and the plot) differs on every run.
# NOTE(review): the name `data` is also the name of a base R function.
data <- rnorm(300,10,2)
# NOTE(review): nothing in this chunk uses palmerpenguins - confirm whether a
# later chunk needs it.
library(palmerpenguins)
# Q-Q plot of the sample against normal quantiles, with a red reference line.
qqnorm(data)
qqline(data, col = "red")

SQL

Create Connection

library(DBI)
library(RSQLite)
# Connect to the SQLite database file. The driver object comes straight from
# RSQLite instead of the legacy dbDriver("SQLite") lookup by name.
conn <- dbConnect(RSQLite::SQLite(),"./database.sqlite")