A course project in the R programming for data science nanodegree. For this project, the goal is to ask and answer three questions about the available bikeshare data from Washington, Chicago, and New York.
Bicycle-sharing systems allow users to rent bicycles on a very short-term basis for a price. This allows people to borrow a bike from point A and return it at point B, though they can also return it to the same location if they'd like to just go for a ride. Regardless, each bike can serve several users per day.
This project requires a conda environment that has R installed.
git clone https://github.com/maqui7295/explore-bikeshare-data
cd explore-bikeshare-data
jupyter notebook
# Preview the first rows of the New York trip data, then count its rows.
# NOTE(review): `ny` is not created anywhere in the code shown here;
# presumably it is loaded in an earlier cell - confirm.
head(ny)
nrow(ny)
X | Start.Time | End.Time | Trip.Duration | Start.Station | End.Station | User.Type | Gender | Birth.Year |
---|---|---|---|---|---|---|---|---|
5688089 | 2017-06-11 14:55:05 | 2017-06-11 15:08:21 | 795 | Suffolk St & Stanton St | W Broadway & Spring St | Subscriber | Male | 1998 |
4096714 | 2017-05-11 15:30:11 | 2017-05-11 15:41:43 | 692 | Lexington Ave & E 63 St | 1 Ave & E 78 St | Subscriber | Male | 1981 |
2173887 | 2017-03-29 13:26:26 | 2017-03-29 13:48:31 | 1325 | 1 Pl & Clinton St | Henry St & Degraw St | Subscriber | Male | 1987 |
3945638 | 2017-05-08 19:47:18 | 2017-05-08 19:59:01 | 703 | Barrow St & Hudson St | W 20 St & 8 Ave | Subscriber | Female | 1986 |
6208972 | 2017-06-21 07:49:16 | 2017-06-21 07:54:46 | 329 | 1 Ave & E 44 St | E 53 St & 3 Ave | Subscriber | Male | 1992 |
1285652 | 2017-02-22 18:55:24 | 2017-02-22 19:12:03 | 998 | State St & Smith St | Bond St & Fulton St | Subscriber | Male | 1986 |
54770
What is the most common start station in each city?
# Count how often each value (e.g. each start station) occurs in `x`.
#
# Args:
#   x: a character or factor vector of station names.
# Returns:
#   A named integer vector of counts, ordered from most to least frequent.
#   Missing values are not counted.
#
# Note: summary() on a factor only sorts by frequency once the number of
# levels exceeds its `maxsum` argument (and then lumps the tail into an
# "(Other)" entry), so the counts are built with table() and ordered
# explicitly instead.
getMostCommonSS <- function(x) {
  counts <- table(as.factor(x))
  ss <- as.vector(counts)
  names(ss) <- names(counts)
  # radix ordering is stable, so tied stations keep their level order
  ss[order(ss, decreasing = TRUE, method = "radix")]
}
# Build a one-sentence summary of the busiest start station.
#
# Args:
#   ss:   named vector of ride counts whose first element is the station
#         to report (its name is the station name).
#   city: name of the city to mention in the sentence.
# Returns:
#   A character string naming the first station in `ss` and its count.
printMostCommon <- function(ss, city) {
  top <- head(ss, 1)
  paste(
    "Most common start station in", city,
    "is", names(top),
    "with", top, "rides."
  )
}
# Count the rides per start station for each city.
# NOTE(review): `ny`, `wash` and `chi` are expected to be data frames with a
# Start.Station column; they are not created in the code shown here - confirm.
NY.SS <- getMostCommonSS(ny$Start.Station)
wash.SS <- getMostCommonSS(wash$Start.Station)
chi.SS <- getMostCommonSS(chi$Start.Station)
# Report the first entry of each count vector; the calls are left at top
# level so that each returned sentence is auto-printed.
printMostCommon(NY.SS, "New York")
printMostCommon(wash.SS, "Washington")
printMostCommon(chi.SS, "Chicago")
'Most common start station in New York is Pershing Square North with 592 rides.'
'Most common start station in Washington is Columbus Circle / Union Station with 1700 rides.'
'Most common start station in Chicago is Streeter Dr & Grand Ave with 210 rides.'
library(ggplot2)
# Plot the first `n` start stations of a city as a horizontal bar chart.
#
# Args:
#   SS:   named vector holding the ride count of each start station (names
#         are the station names); the first `n` entries are plotted, so it
#         should already be ordered from most to least frequent.
#   city: city name used in the plot title.
#   n:    number of stations to show.
# Returns:
#   The ggplot object for the chart.
plotTopN <- function(SS, city, n) {
  # convert the named counts to a data frame with one row per station
  df <- data.frame(station = names(SS), SS)
  # remove the rownames generated by the data.frame function
  rownames(df) <- NULL
  # head() never reads past the last row, whereas df[1:n, ] pads the data
  # with all-NA rows when fewer than n stations exist and selects row 1
  # when n is 0
  top <- head(df, n)
  ggplot(
    top,
    aes(
      x = reorder(station, SS), y = SS, label = SS,
      fill = station, color = station
    )
  ) +
    geom_col() +
    # push the data labels past the end of the bars and make them black
    geom_text(nudge_y = 1, color = "black") +
    labs(
      title = paste("Top", n, "start stations in", city),
      x = "", y = "count"
    ) +
    theme_bw() +
    # rotate the axis text by 90 degrees and hide the redundant legend
    theme(axis.text.x = element_text(angle = 90), legend.position = "none") +
    # make the bars horizontal so the station names stay readable
    coord_flip()
}
# Draw the first ten entries of the New York start-station counts
plotTopN(NY.SS, "New York", 10)
What is the most common ride month in each city?
# Split the Start.Time column of a trip data frame into its date parts.
#
# Args:
#   df: data frame with a Start.Time column whose values begin with a
#       YYYY-MM-DD date.
# Returns:
#   `df` with five added columns:
#     start_date        the trip start as a Date
#     start_month_name  full English month name, e.g. "June"
#     start_day         two-digit day of the month, e.g. "11"
#     start_year        four-digit year, e.g. "2017"
#     start_month       two-digit month number, e.g. "06"
#   A missing (NA) Start.Time yields NA in every added column.
addStartMonth <- function(df) {
  # convert to Date (format: YYYY-MM-DD); the time-of-day part is dropped
  df$start_date <- as.Date(df$Start.Time)
  # month.name is always English, whereas months() follows the session
  # locale and would not match English month names used elsewhere
  df$start_month_name <- month.name[as.integer(format(df$start_date, "%m"))]
  # format() extracts each part directly instead of stripping the rest of
  # the date string with regular expressions
  df$start_day <- format(df$start_date, "%d")
  df$start_year <- format(df$start_date, "%Y")
  df$start_month <- format(df$start_date, "%m")
  df
}
# Add the date-part columns (including start_month_name) to each city
ny2 <- addStartMonth(ny)
wash2 <- addStartMonth(wash)
chi2 <- addStartMonth(chi)

# Months to count, in calendar order; the same order is used for the x axis
levels <- c("January", "February", "March", "April", "May", "June")

# Count the rides per month.
#   month_names:  character vector of month names, one per ride
#   month_levels: the months to count, in the desired order
# Returns a named integer vector with exactly one entry per month level.
# Rides whose month is NA or not listed are left out; summary() on the
# factor would instead append an "NA's" entry that later shows up as a
# bogus month in the plot.
countRideMonths <- function(month_names, month_levels) {
  counts <- table(factor(month_names, levels = month_levels))
  out <- as.vector(counts)
  names(out) <- names(counts)
  out
}

# get the counts of the ride months in each city
NY.month_counts <- countRideMonths(ny2$start_month_name, levels)
wash.month_counts <- countRideMonths(wash2$start_month_name, levels)
chi.month_counts <- countRideMonths(chi2$start_month_name, levels)

# display the results
# NY.month_counts
# wash.month_counts
# chi.month_counts

# Creating a plot to compare the months side by side.
# Convert the counts to data frames; month is stored as a factor with the
# calendar-ordered levels so ggplot does not sort the months alphabetically.
df_ny_mnth <- data.frame(
  month = factor(names(NY.month_counts), levels = levels),
  count = NY.month_counts,
  city = "New York"
)
df_wash_mnth <- data.frame(
  month = factor(names(wash.month_counts), levels = levels),
  count = wash.month_counts,
  city = "Washington"
)
df_chi_mnth <- data.frame(
  month = factor(names(chi.month_counts), levels = levels),
  count = chi.month_counts,
  city = "Chicago"
)

# combine all to a single dataset
df_all_mnth <- rbind(df_ny_mnth, df_wash_mnth, df_chi_mnth)
# remove the rownames of the dataset
rownames(df_all_mnth) <- NULL

# create the plot
ggplot(df_all_mnth, aes(x = month, y = count, fill = city, label = count)) +
  # put the bars side by side
  geom_col(position = position_dodge(width = 0.9)) +
  # dodge the labels by the same width as the bars so each label sits on
  # its own bar; tilt them by 45 degrees to avoid overlapping
  geom_text(position = position_dodge(width = 0.9), size = 3.5, angle = 45) +
  theme_bw() +
  ggtitle("The ride months in each city",
          subtitle = "June is the most common month in each city")
In each city, more bike rides started in June than in any other month.