forked from rdpeng/RepData_PeerAssessment1
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPA1_template.Rmd
211 lines (160 loc) · 7.21 KB
/
PA1_template.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
---
title: "Reproducible Research: Peer Assessment 1"
subtitle: "This is an R Markdown document for peer assessment 1 of Coursera's Reproducible Research course."
author: "Carlos Hernández"
output:
  html_document:
    keep_md: true
---
## Loading and preprocessing the data
First, unzip the archive and read the data into `data`.
```{r warning=FALSE, error=FALSE, message = FALSE}
# Switch the time locale to "C" so that weekdays() yields English day names
# on every platform. The locale name "English" is a Windows-style name and is
# generally rejected elsewhere, which would leave day names locale-dependent.
invisible(Sys.setlocale("LC_TIME", "C"))
# Extract activity.csv from the archive shipped next to this document.
unzip("./activity.zip")
# Raw activity data; later chunks use its steps, date and interval columns.
data <- read.csv("./activity.csv")
```
A quick view of the data structure
```{r}
# Compact overview of the columns of `data` and their types
str(data)
```
```{r}
# Per-column summary statistics of `data`
summary(data)
```
```{r message=FALSE}
# Add needed packages
library(dplyr)
library(ggplot2)
library(scales)
# Colour palettes ----
# Each palette holds seven colours; the plots below pass them to
# scale_fill_manual() for the `Weekday` fill.

# Palette 1: dark-to-light blues followed by two greys.
colors.pallete.1 <- c(
  "#011f4b", "#03396c", "#005b96", "#6497b1",
  "#b3cde0", "#cccccc", "#8c8c8c"
)

# Palette 2: a teal ramp running from dark to light.
colors.pallete.2 <- c(
  "#03232b", "#053742", "#064a5a", "#085e72",
  "#0a728a", "#0b85a2", "#0d99b9"
)
```
## What is mean total number of steps taken per day?
```{r message=FALSE, warning=FALSE, fig.width=11, fig.height= 8}
# Total steps recorded on each day. A day with no measurements at all is
# kept as NA instead of being summed to 0, so that unmeasured days do not
# drag the mean and median down.
data_by_day <- data %>%
  group_by(date) %>%
  summarise(
    total_steps = if (all(is.na(steps))) {
      NA_integer_
    } else {
      sum(steps, na.rm = TRUE)
    }
  )
# Day-of-week name for each date; used as the bar fill colour.
data_by_day$Weekday <- weekdays(as.Date(data_by_day$date))
# Mean and median of the daily totals, skipping days without data.
data_steps_mean <- mean(data_by_day$total_steps, na.rm = TRUE)
data_steps_median <- median(data_by_day$total_steps, na.rm = TRUE)
# One bar per day, with horizontal reference lines for the mean (orange)
# and the median (blue) and the daily total printed above each bar.
ggplot(data = data_by_day, aes(y = total_steps, x = date, fill = Weekday)) +
  geom_col() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    legend.position = "top",
    panel.background = element_rect(fill = '#f1f1f1', colour = 'white')
  ) +
  scale_fill_manual(values = colors.pallete.1) +
  ggtitle(label = "Total Steps by Day") +
  ylab("Total Steps") +
  xlab("Day") +
  geom_hline(
    yintercept = data_steps_mean,
    linetype = "solid", color = "orange", size = 1.2
  ) +
  annotate(
    geom = "text", x = 8, y = 21500,
    label = paste("Mean =", round(data_steps_mean, 2)), color = "orange"
  ) +
  geom_hline(
    yintercept = data_steps_median,
    linetype = "solid", color = "blue", size = 1
  ) +
  annotate(
    geom = "text", x = 8, y = 22200,
    label = paste("Median =", round(data_steps_median, 2)), color = "blue"
  ) +
  # Only `label` is a data mapping (x and y are inherited). angle, size and
  # hjust are constants, so they are set outside aes(); inside aes() the
  # size value would be passed through the size scale instead of being used
  # literally.
  geom_text(
    aes(label = total_steps),
    angle = 90, size = 2, hjust = -0.1,
    color = "#5ea5dd", show.legend = FALSE
  )
```
## What is the average daily activity pattern?
```{r message=FALSE,warning=FALSE, fig.width=11, fig.height= 7}
# Mean number of steps for every interval, averaged over all days
# (missing values are ignored).
total_steps_by_interval <- data %>%
  group_by(interval) %>%
  summarise(avg_steps = mean(steps, na.rm = TRUE))
# Row with the highest average. which.max() always yields exactly one row,
# whereas filtering with `== max(...)` could return several rows on ties.
max_avg_interval <- total_steps_by_interval[
  which.max(total_steps_by_interval$avg_steps),
]
# Line plot of the average pattern, with the maximum labelled in red and
# pointed at by an arrow.
ggplot(total_steps_by_interval, aes(x = interval, y = avg_steps)) +
  geom_line(colour = colors.pallete.1[1]) +
  ggtitle(label = "Average daily activity pattern by Interval") +
  ylab("Average of steps") +
  xlab("Interval") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    legend.position = "top",
    panel.background = element_rect(fill = "#f1f1f1", colour = 'white')
  ) +
  annotate(
    geom = "text", y = 200, x = max_avg_interval$interval + 300,
    label = paste("Maximun average = ", round(max_avg_interval$avg_steps, 2)),
    color = "red"
  ) +
  annotate(
    geom = "text", y = 190, x = max_avg_interval$interval + 300,
    label = paste("Interval = ", round(max_avg_interval$interval, 2)),
    color = "red"
  ) +
  # The arrow is a single annotation, so it is drawn with annotate() rather
  # than geom_segment(aes(...)), which would redraw it once per data row.
  annotate(
    geom = "segment",
    x = max_avg_interval$interval + 500,
    y = max_avg_interval$avg_steps + 10,
    xend = max_avg_interval$interval,
    yend = max_avg_interval$avg_steps,
    arrow = arrow(length = unit(0.5, "cm"))
  )
```
## Imputing missing values
First we will calculate the number of NA values
```{r}
# Logical mask of the rows whose step count is missing. is.na() is already
# vectorised, so it is applied to the whole column directly.
na_cases <- is.na(data$steps)
# The missing entries themselves; their count is reported in the text below.
na_values <- data$steps[na_cases]
```
*NA values in the data set:* `r length(na_values)`
Our strategy for imputing missing values is to replace each missing value with the mean number of steps for its 5-minute interval.
```{r message=FALSE}
# Mean number of steps for every interval across all days, ignoring
# missing values. fill_missing() looks its replacement values up here.
interval_avg <- data %>%
  group_by(interval) %>%
  summarise(steps = mean(steps, na.rm = TRUE))

#' Replace missing step counts with the mean of their interval
#'
#' Vectorised: `step` and `interval` may be single values or vectors of the
#' same length. Replacement values are read from the global `interval_avg`
#' table.
#'
#' @param step step count(s), possibly `NA`
#' @param interval interval identifier(s), one per element of `step`
#' @return `step` with every `NA` replaced by the mean of its interval
fill_missing <- function(step, interval) {
  missing <- is.na(step)
  # Position of each missing row's interval in the lookup table.
  lookup <- match(interval[missing], interval_avg$interval)
  step[missing] <- interval_avg$steps[lookup]
  step
}

# Impute on a copy so the original `data` keeps its missing values.
data_filling <- data
# A single vectorised call replaces the former row-by-row mapply().
data_filling$steps <- fill_missing(data$steps, data$interval)
```
Now we can plot the total number of steps per day again, this time using the imputed values.
```{r message=FALSE, warning=FALSE, fig.width=11, fig.height= 8}
# Total steps per day, computed from the imputed data set.
data_filling_by_day <- data_filling %>%
  group_by(date) %>%
  summarise(total_steps = sum(steps))
# Day-of-week name for each date; used as the bar fill colour.
data_filling_by_day$Weekday <- weekdays(as.Date(data_filling_by_day$date))
# Mean and median of the daily totals after imputation.
data_filling_mean <- mean(data_filling_by_day$total_steps)
data_filling_median <- median(data_filling_by_day$total_steps)
# One bar per day, with horizontal reference lines for the mean (orange)
# and the median (blue) and the rounded daily total printed above each bar.
ggplot(
  data = data_filling_by_day,
  aes(y = total_steps, x = date, fill = Weekday)
) +
  geom_col() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    legend.position = "top",
    panel.background = element_rect(fill = '#f1f1f1', colour = 'white')
  ) +
  scale_fill_manual(values = colors.pallete.2) +
  ggtitle(label = "Total Steps by Day (after missing values are imputed)") +
  ylab("Total Steps") +
  xlab("Day") +
  geom_hline(
    yintercept = data_filling_mean,
    linetype = "solid", color = "orange", size = 1.2
  ) +
  annotate(
    geom = "text", x = 8, y = 21500,
    label = paste("Mean =", round(data_filling_mean, 2)), color = "orange"
  ) +
  geom_hline(
    yintercept = data_filling_median,
    linetype = "solid", color = "blue", size = 1
  ) +
  annotate(
    geom = "text", x = 8, y = 22200,
    label = paste("Median =", round(data_filling_median, 2)), color = "blue"
  ) +
  # Only `label` is a data mapping (x and y are inherited). angle, size and
  # hjust are constants, so they are set outside aes(); inside aes() the
  # size value would be passed through the size scale instead of being used
  # literally.
  geom_text(
    aes(label = round(total_steps, 0)),
    angle = 90, size = 2, hjust = -0.1,
    color = "#5ea5dd", show.legend = FALSE
  )
```
## Are there differences in activity patterns between weekdays and weekends?
```{r message=FALSE, error=FALSE, fig.width=11, fig.height= 8}
#' Classify dates as "Weekend" or "Weekday"
#'
#' The decision is based on the numeric day of the week (0 = Sunday,
#' 6 = Saturday), so it does not depend on the language in which the
#' current locale spells day names. Works on a single date or a vector.
#'
#' @param date a date value or vector of dates (anything `as.Date()` accepts)
#' @return a character vector the same length as `date`, holding "Weekend"
#'   for Saturdays and Sundays and "Weekday" otherwise
parse.day <- function(date) {
  day_number <- as.POSIXlt(as.Date(date))$wday
  ifelse(day_number %in% c(0L, 6L), "Weekend", "Weekday")
}
# Label every row as "Weekend" or "Weekday". Each distinct date is
# classified once with parse.day() and the result is mapped back onto all
# rows, instead of calling parse.day() once per row.
date_labels <- as.character(data$date)
distinct_dates <- unique(date_labels)
# vapply() guarantees one character value per date, named by that date.
day_lookup <- vapply(distinct_dates, parse.day, character(1))
data$day_type <- unname(day_lookup[date_labels])
# Mean steps for every interval within each day type, ignoring missing
# values.
data_by_interval <- data %>%
  group_by(interval, day_type) %>%
  summarise(avg_steps = mean(steps, na.rm = TRUE))
# One panel per day type. The facets already identify the day type, so the
# colour legend is hidden.
ggplot(data_by_interval, aes(x = interval, y = avg_steps, col = day_type)) +
  geom_line() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 8, face = "bold"),
    legend.position = "none",
    panel.background = element_rect(fill = '#f1f1f1', colour = 'white')
  ) +
  facet_grid(rows = vars(day_type)) +
  ggtitle("Activity patterns between weekdays and weekends") +
  ylab("Average of steps") +
  xlab("Interval")
```