1 Load the dataset

# Attach rmarkdown first so paged_table() is available, then read the survey
# data (character columns become factors) and show it as a paged table.
library(rmarkdown)
survey <- read.csv(file = "survey.csv", stringsAsFactors = TRUE)
paged_table(survey)

2 Frequency

Get the frequency table of the variable ‘Smoke’ in the data set ‘survey’.

# Count how many respondents fall into each Smoke category.
smoke_freq <- table(survey$Smoke)
print(smoke_freq)
## 
## Heavy Never Occas Regul 
##    11   189    19    17

3 Bar graph

Draw bar graph of variable ‘Smoke’ of data set ‘survey’

# One bar per Smoke category, bar height = frequency.
barplot(height = smoke_freq)

we can also use a compact expression

# Tabulate and plot in a single expression.
barplot(height = table(survey$Smoke))

Sort in the order of frequency

# sort() orders the counts increasingly, so bars run from rarest to most common.
barplot(height = sort(smoke_freq))

barplot() can draw a bar plot of any named numeric vector, such as the one created below.

# Build a named numeric vector (names A1..A5) and plot it directly.
afreq <- setNames(c(10, 20, 30, 40, 60), paste0("A", 1:5))
afreq
## A1 A2 A3 A4 A5 
## 10 20 30 40 60
barplot(afreq)

4 Pie chart

Draw pie chart

# One slice per Smoke category, slice size proportional to its frequency.
pie(x = smoke_freq)

or you can use a compact expression

# Tabulate and plot in a single expression.
pie(x = table(survey$Smoke))

5 Relative frequency and percentage

# Relative frequency = count / number of rows in the data set.
# Note: nrow(survey) counts every row, including rows where Smoke is missing
# (table() leaves those out by default), so the shares below add up to
# slightly less than 1 (and the percentages to slightly less than 100).
smoke_relfreq <- smoke_freq / nrow(survey)
smoke_relfreq
## 
##      Heavy      Never      Occas      Regul 
## 0.04641350 0.79746835 0.08016878 0.07172996
# Percentage = relative frequency * 100.
smoke_perc <- smoke_relfreq * 100
smoke_perc
## 
##     Heavy     Never     Occas     Regul 
##  4.641350 79.746835  8.016878  7.172996

find angles used in pie chart

# pie() sizes each slice as x / sum(x), so the angles have to be computed from
# the total of the tabulated counts. Dividing by nrow(survey) would also count
# rows with a missing Smoke value and the angles would not add up to 360.
smoke_pie <- smoke_freq / sum(smoke_freq) * 360
smoke_pie
## 
##     Heavy     Never     Occas     Regul 
##  16.77966 288.30508  28.98305  25.93220

6 Histogram

draw histogram of variable ‘Pulse’ of data set ‘survey’

# Default binning.
hist(survey$Pulse)

# Ask for roughly 20 bins (a suggestion; hist() picks "pretty" break points).
hist(survey$Pulse, breaks = 20)

# Explicit break points: bins of width 5 from 35 to 110.
hist(survey$Pulse, breaks = seq(from = 35, to = 110, by = 5))

work with a vector of numbers

# Parse whitespace-separated numbers from an inline string, then plot them.
x <- scan(
  text = "2 3 4 2 1 2 3 4 4 2 2 2 2 2 1 1 1 2 4 5 5",
  what = numeric()
)
hist(x)

# Same idea, but read the numbers from a text file.
x <- scan(file = "numbers.txt")
hist(x)

7 Cumulative frequency for a histogram

# Keep the object returned by hist(): it holds the breaks, counts, densities
# and midpoints of the bins that were drawn.
pulse_hist <- hist(survey$Pulse, breaks = seq(from = 35, to = 110, by = 5))

look at the results of hist function

print(pulse_hist)
## $breaks
##  [1]  35  40  45  50  55  60  65  70  75  80  85  90  95 100 105 110
## 
## $counts
##  [1]  2  0  4  2 14 21 37 27 38 14 18  6  7  2  0
## 
## $density
##  [1] 0.002083333 0.000000000 0.004166667 0.002083333 0.014583333 0.021875000
##  [7] 0.038541667 0.028125000 0.039583333 0.014583333 0.018750000 0.006250000
## [13] 0.007291667 0.002083333 0.000000000
## 
## $mids
##  [1]  37.5  42.5  47.5  52.5  57.5  62.5  67.5  72.5  77.5  82.5  87.5  92.5
## [13]  97.5 102.5 107.5
## 
## $xname
## [1] "survey$Pulse"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

draw cumulative frequency

# cumsum(counts)[i] is the number of observations up to the upper edge of
# bin i, so each point is placed at the bin's upper break (breaks[-1]) rather
# than at the bin midpoint, which would overstate the count at that x value.
plot(pulse_hist$breaks[-1], cumsum(pulse_hist$counts), type = "b",
     xlab = "Pulse", ylab = "Cumulative frequency")

draw cumulative relative frequency

# Total number of observations that were binned.
n <- sum(pulse_hist$counts)
# Cumulative share of observations up to the upper edge of each bin; plotted
# at the upper breaks (breaks[-1]) because that is where each cumulative
# value is actually reached, ending at 1 on the last break.
plot(pulse_hist$breaks[-1], cumsum(pulse_hist$counts) / n, type = "b",
     xlab = "Pulse", ylab = "Cumulative relative frequency")

8 Empirical distribution function

The empirical distribution function computed from observations \(x_1,\ldots,x_n\) is defined as \[ \hat{F}(x) = \frac{\sum_{i=1}^n I(x_i\leq x)}{n} = \frac{\mbox{number of } x_i\leq x }{n}, \] where \(I(\cdot)\) is the indicator function, equal to 1 if the condition in its argument is true and 0 otherwise.

# Five equally spaced values: the ECDF is a staircase with jumps of 1/5.
x <- as.numeric(1:5)

plot(ecdf(x))

# ECDF of Pulse with a reference grid: horizontal lines every 0.05 and
# vertical lines every 5 units. The vertical grid runs to 110 (the same upper
# limit as the histogram breaks) because Pulse has values above 100, which a
# grid ending at 100 would leave without guide lines.
plot(ecdf(survey$Pulse))
abline(h = seq(0, 1, by = 0.05), v = seq(0, 110, by = 5),
       lty = 2, col = "grey")

9 Stem-leaf plot

stem(x = survey$Pulse)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##    3 | 5
##    4 | 0
##    4 | 88
##    5 | 004
##    5 | 569
##    6 | 000000000000122223444444444
##    6 | 555555666666788888888888888889
##    7 | 00000000000001122222222222222344444
##    7 | 5555566666666666668888999
##    8 | 0000000000000000001333344444
##    8 | 55556667788889
##    9 | 00000000222222
##    9 | 66678
##   10 | 0044

10 Scatterplot of two or three variables

# Height against observation index.
plot(survey$Height)

# Same plot, coloured by Sex (already a factor, so no conversion is needed).
plot(survey$Height, col = survey$Sex)

# Pulse against observation index, coloured by Sex.
plot(survey$Pulse, col = survey$Sex)

# Height (y axis) against Pulse (x axis).
plot(survey$Height ~ survey$Pulse)

# Height against Pulse, coloured by Sex.
plot(survey$Height ~ survey$Pulse, col = survey$Sex)

# Wr.Hnd against observation index, coloured by Sex.
plot(survey$Wr.Hnd, col = survey$Sex)

# Wr.Hnd (y axis) against Height (x axis), coloured by Sex.
plot(survey$Wr.Hnd ~ survey$Height, col = survey$Sex)

# Wr.Hnd (y axis) against NW.Hnd (x axis), coloured by Sex.
plot(survey$Wr.Hnd ~ survey$NW.Hnd, col = survey$Sex)