Analyze how annual members and casual riders use Cyclistic bikes
differently.
The goal is to support the marketing team in designing a
data-driven strategy to convert casual riders
into annual members.
Data set Link: Divvy Trip Data
Criteria Evaluation Reliable Yes – from Divvy/Motivate Original Yes – official open data Comprehensive Yes – includes trips, timestamps, user info Current Covers Q1 2019 & Q1 2020 (limitation due to storage) Cited Provided by Google in the capstone project —
library(tidyverse)
library(lubridate)
df_2019 <- read.csv("/Users/mostafaessam/Desktop/portofolio/Cylistic_Bike_share/rstudio-export/Divvy_Trips_2019_Q1.csv")
df_2020 <- read.csv("/Users/mostafaessam/Desktop/portofolio/Cylistic_Bike_share/rstudio-export/Divvy_Trips_2020_Q1.csv")
colnames(df_2019)
## [1] "trip_id" "start_time" "end_time"
## [4] "bikeid" "tripduration" "from_station_id"
## [7] "from_station_name" "to_station_id" "to_station_name"
## [10] "usertype" "gender" "birthyear"
colnames(df_2020)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
head(df_2019, 10)
## trip_id start_time end_time bikeid tripduration
## 1 21742443 2019-01-01 00:04:37 2019-01-01 00:11:07 2167 390.0
## 2 21742444 2019-01-01 00:08:13 2019-01-01 00:15:34 4386 441.0
## 3 21742445 2019-01-01 00:13:23 2019-01-01 00:27:12 1524 829.0
## 4 21742446 2019-01-01 00:13:45 2019-01-01 00:43:28 252 1,783.0
## 5 21742447 2019-01-01 00:14:52 2019-01-01 00:20:56 1170 364.0
## 6 21742448 2019-01-01 00:15:33 2019-01-01 00:19:09 2437 216.0
## 7 21742449 2019-01-01 00:16:06 2019-01-01 00:19:03 2708 177.0
## 8 21742450 2019-01-01 00:18:41 2019-01-01 00:20:21 2796 100.0
## 9 21742451 2019-01-01 00:18:43 2019-01-01 00:47:30 6205 1,727.0
## 10 21742452 2019-01-01 00:19:18 2019-01-01 00:24:54 3939 336.0
## from_station_id from_station_name to_station_id
## 1 199 Wabash Ave & Grand Ave 84
## 2 44 State St & Randolph St 624
## 3 15 Racine Ave & 18th St 644
## 4 123 California Ave & Milwaukee Ave 176
## 5 173 Mies van der Rohe Way & Chicago Ave 35
## 6 98 LaSalle St & Washington St 49
## 7 98 LaSalle St & Washington St 49
## 8 211 St. Clair St & Erie St 142
## 9 150 Fort Dearborn Dr & 31st St 148
## 10 268 Lake Shore Dr & North Blvd 141
## to_station_name usertype gender birthyear
## 1 Milwaukee Ave & Grand Ave Subscriber Male 1989
## 2 Dearborn St & Van Buren St (*) Subscriber Female 1990
## 3 Western Ave & Fillmore St (*) Subscriber Female 1994
## 4 Clark St & Elm St Subscriber Male 1993
## 5 Streeter Dr & Grand Ave Subscriber Male 1994
## 6 Dearborn St & Monroe St Subscriber Female 1983
## 7 Dearborn St & Monroe St Subscriber Male 1984
## 8 McClurg Ct & Erie St Subscriber Male 1990
## 9 State St & 33rd St Subscriber Male 1995
## 10 Clark St & Lincoln Ave Subscriber Male 1996
head(df_2020, 10)
## ride_id rideable_type started_at ended_at
## 1 EACB19130B0CDA4A docked_bike 2020-01-21 20:06:59 2020-01-21 20:14:30
## 2 8FED874C809DC021 docked_bike 2020-01-30 14:22:39 2020-01-30 14:26:22
## 3 789F3C21E472CA96 docked_bike 2020-01-09 19:29:26 2020-01-09 19:32:17
## 4 C9A388DAC6ABF313 docked_bike 2020-01-06 16:17:07 2020-01-06 16:25:56
## 5 943BC3CBECCFD662 docked_bike 2020-01-30 08:37:16 2020-01-30 08:42:48
## 6 6D9C8A6938165C11 docked_bike 2020-01-10 12:33:05 2020-01-10 12:37:54
## 7 31EB9B8F406D4C82 docked_bike 2020-01-10 13:07:35 2020-01-10 13:12:24
## 8 A2B24E3F9C9720E3 docked_bike 2020-01-10 07:24:53 2020-01-10 07:29:50
## 9 5E3F01E1441730B7 docked_bike 2020-01-31 16:37:16 2020-01-31 16:42:11
## 10 19DC57F7E3140131 docked_bike 2020-01-31 09:39:17 2020-01-31 09:42:40
## start_station_name start_station_id
## 1 Western Ave & Leland Ave 239
## 2 Clark St & Montrose Ave 234
## 3 Broadway & Belmont Ave 296
## 4 Clark St & Randolph St 51
## 5 Clinton St & Lake St 66
## 6 Wells St & Hubbard St 212
## 7 Desplaines St & Randolph St 96
## 8 Desplaines St & Randolph St 96
## 9 Wells St & Hubbard St 212
## 10 Clark St & Lake St 38
## end_station_name end_station_id start_lat start_lng
## 1 Clark St & Leland Ave 326 41.9665 -87.6884
## 2 Southport Ave & Irving Park Rd 318 41.9616 -87.6660
## 3 Wilton Ave & Belmont Ave 117 41.9401 -87.6455
## 4 Fairbanks Ct & Grand Ave 24 41.8846 -87.6319
## 5 Wells St & Hubbard St 212 41.8856 -87.6418
## 6 Desplaines St & Randolph St 96 41.8899 -87.6343
## 7 Wells St & Hubbard St 212 41.8846 -87.6446
## 8 Wells St & Hubbard St 212 41.8846 -87.6446
## 9 Desplaines St & Randolph St 96 41.8899 -87.6343
## 10 Orleans St & Merchandise Mart Plaza 100 41.8860 -87.6309
## end_lat end_lng member_casual
## 1 41.9671 -87.6674 member
## 2 41.9542 -87.6644 member
## 3 41.9402 -87.6530 member
## 4 41.8918 -87.6206 member
## 5 41.8899 -87.6343 member
## 6 41.8846 -87.6446 member
## 7 41.8899 -87.6343 member
## 8 41.8899 -87.6343 member
## 9 41.8846 -87.6446 member
## 10 41.8882 -87.6364 member
df_2019_clean <- df_2019 %>%
rename(
ride_id = trip_id,
started_at = start_time,
ended_at = end_time,
start_station_id = from_station_id,
start_station_name = from_station_name,
end_station_name = to_station_name,
end_station_id = to_station_id
) %>%
mutate(
member_casual = case_when(
usertype == "Subscriber" ~ "member",
usertype == "Customer" ~ "casual"
),
rideable_type = NA_character_,
start_lat = NA_real_,
start_lng = NA_real_,
end_lat = NA_real_,
end_lng = NA_real_
) %>%
select(-tripduration, -bikeid, -gender, -birthyear) %>%
select(
ride_id, rideable_type, started_at, ended_at,
start_station_name, start_station_id,
end_station_name, end_station_id,
start_lat, start_lng, end_lat, end_lng,
member_casual
)
df_2019_clean <- df_2019_clean %>%
mutate(
ride_id = as.character(ride_id),
rideable_type = as.character(rideable_type),
start_station_id = as.character(start_station_id),
end_station_id = as.character(end_station_id),
start_lat = as.numeric(start_lat),
start_lng = as.numeric(start_lng),
end_lat = as.numeric(end_lat),
end_lng = as.numeric(end_lng)
)
df_2020 <- df_2020 %>%
mutate(
ride_id = as.character(ride_id),
rideable_type = as.character(rideable_type),
start_station_name = as.character(start_station_name),
start_station_id = as.character(start_station_id),
end_station_name = as.character(end_station_name),
end_station_id = as.character(end_station_id),
start_lat = as.numeric(start_lat),
start_lng = as.numeric(start_lng),
end_lat = as.numeric(end_lat),
end_lng = as.numeric(end_lng),
member_casual = as.character(member_casual)
)
df_2019_clean <- df_2019_clean %>%
mutate(
started_at = ymd_hms(started_at),
ended_at = ymd_hms(ended_at)
)
df_2020 <- df_2020 %>%
mutate(
started_at = ymd_hms(started_at),
ended_at = ymd_hms(ended_at)
)
summary(df_2019_clean)
## ride_id rideable_type started_at
## Length:365069 Length:365069 Min. :2019-01-01 00:04:37
## Class :character Class :character 1st Qu.:2019-01-23 05:26:54
## Mode :character Mode :character Median :2019-02-25 07:52:56
## Mean :2019-02-19 21:43:15
## 3rd Qu.:2019-03-17 16:52:47
## Max. :2019-03-31 23:53:48
##
## ended_at start_station_name start_station_id
## Min. :2019-01-01 00:11:07 Length:365069 Length:365069
## 1st Qu.:2019-01-23 05:49:40 Class :character Class :character
## Median :2019-02-25 08:03:50 Mode :character Mode :character
## Mean :2019-02-19 22:00:11
## 3rd Qu.:2019-03-17 17:16:16
## Max. :2019-06-17 16:04:35
##
## end_station_name end_station_id start_lat start_lng
## Length:365069 Length:365069 Min. : NA Min. : NA
## Class :character Class :character 1st Qu.: NA 1st Qu.: NA
## Mode :character Mode :character Median : NA Median : NA
## Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA
## NA's :365069 NA's :365069
## end_lat end_lng member_casual
## Min. : NA Min. : NA Length:365069
## 1st Qu.: NA 1st Qu.: NA Class :character
## Median : NA Median : NA Mode :character
## Mean :NaN Mean :NaN
## 3rd Qu.: NA 3rd Qu.: NA
## Max. : NA Max. : NA
## NA's :365069 NA's :365069
summary(df_2020)
## ride_id rideable_type started_at
## Length:426887 Length:426887 Min. :2020-01-01 00:04:44
## Class :character Class :character 1st Qu.:2020-01-24 14:03:26
## Mode :character Mode :character Median :2020-02-17 05:01:27
## Mean :2020-02-14 01:23:18
## 3rd Qu.:2020-03-05 15:08:13
## Max. :2020-03-31 23:51:34
##
## ended_at start_station_name start_station_id
## Min. :2020-01-01 00:10:54 Length:426887 Length:426887
## 1st Qu.:2020-01-24 14:21:24 Class :character Class :character
## Median :2020-02-17 05:48:58 Mode :character Mode :character
## Mean :2020-02-14 01:45:25
## 3rd Qu.:2020-03-05 15:27:54
## Max. :2020-05-19 20:10:34
##
## end_station_name end_station_id start_lat start_lng
## Length:426887 Length:426887 Min. :41.74 Min. :-87.77
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.89 Median :-87.64
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.92 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.55
##
## end_lat end_lng member_casual
## Min. :41.74 Min. :-87.77 Length:426887
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character
## Median :41.89 Median :-87.64 Mode :character
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.92 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.55
## NA's :1 NA's :1
analysis_df <- bind_rows(df_2019_clean, df_2020)
colSums(is.na(analysis_df))
## ride_id rideable_type started_at ended_at
## 0 365069 0 0
## start_station_name start_station_id end_station_name end_station_id
## 0 0 0 1
## start_lat start_lng end_lat end_lng
## 365069 365069 365070 365070
## member_casual
## 0
colSums(is.na(analysis_df)) / nrow(analysis_df) * 100
## ride_id rideable_type started_at ended_at
## 0.000000e+00 4.609713e+01 0.000000e+00 0.000000e+00
## start_station_name start_station_id end_station_name end_station_id
## 0.000000e+00 0.000000e+00 0.000000e+00 1.262696e-04
## start_lat start_lng end_lat end_lng
## 4.609713e+01 4.609713e+01 4.609726e+01 4.609726e+01
## member_casual
## 0.000000e+00
sapply(analysis_df[, sapply(analysis_df, is.character)], function(x) sum(x == "", na.rm = TRUE))
## ride_id rideable_type start_station_name start_station_id
## 0 0 0 0
## end_station_name end_station_id member_casual
## 1 0 0
analysis_df <- analysis_df %>% mutate(year = year(started_at))
table(is.na(analysis_df$rideable_type), analysis_df$year)
##
## 2019 2020
## FALSE 0 426887
## TRUE 365069 0
analysis_df <- analysis_df %>% filter(!is.na(end_station_name))
sum(duplicated(analysis_df))
## [1] 0
summary(analysis_df)
## ride_id rideable_type started_at
## Length:791956 Length:791956 Min. :2019-01-01 00:04:37
## Class :character Class :character 1st Qu.:2019-02-28 17:04:04
## Mode :character Mode :character Median :2020-01-07 12:48:50
## Mean :2019-09-01 11:58:08
## 3rd Qu.:2020-02-19 19:31:54
## Max. :2020-03-31 23:51:34
##
## ended_at start_station_name start_station_id
## Min. :2019-01-01 00:11:07 Length:791956 Length:791956
## 1st Qu.:2019-02-28 17:15:58 Class :character Class :character
## Median :2020-01-07 13:02:50 Mode :character Mode :character
## Mean :2019-09-01 12:17:52
## 3rd Qu.:2020-02-19 19:51:54
## Max. :2020-05-19 20:10:34
##
## end_station_name end_station_id start_lat start_lng
## Length:791956 Length:791956 Min. :41.74 Min. :-87.77
## Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
## Mode :character Mode :character Median :41.89 Median :-87.64
## Mean :41.90 Mean :-87.64
## 3rd Qu.:41.92 3rd Qu.:-87.63
## Max. :42.06 Max. :-87.55
## NA's :365069 NA's :365069
## end_lat end_lng member_casual year
## Min. :41.74 Min. :-87.77 Length:791956 Min. :2019
## 1st Qu.:41.88 1st Qu.:-87.66 Class :character 1st Qu.:2019
## Median :41.89 Median :-87.64 Mode :character Median :2020
## Mean :41.90 Mean :-87.64 Mean :2020
## 3rd Qu.:41.92 3rd Qu.:-87.63 3rd Qu.:2020
## Max. :42.06 Max. :-87.55 Max. :2020
## NA's :365070 NA's :365070
str(analysis_df)
## 'data.frame': 791956 obs. of 14 variables:
## $ ride_id : chr "21742443" "21742444" "21742445" "21742446" ...
## $ rideable_type : chr NA NA NA NA ...
## $ started_at : POSIXct, format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
## $ ended_at : POSIXct, format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
## $ start_station_name: chr "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
## $ start_station_id : chr "199" "44" "15" "123" ...
## $ end_station_name : chr "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
## $ end_station_id : chr "84" "624" "644" "176" ...
## $ start_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ start_lng : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lat : num NA NA NA NA NA NA NA NA NA NA ...
## $ end_lng : num NA NA NA NA NA NA NA NA NA NA ...
## $ member_casual : chr "member" "member" "member" "member" ...
## $ year : num 2019 2019 2019 2019 2019 ...
table(analysis_df$member_casual)
##
## casual member
## 71643 720313
analysis_df <- analysis_df %>%
mutate(day_of_week = weekdays(started_at))
analysis_df <- analysis_df %>%
mutate(ride_length = as.numeric(difftime(ended_at, started_at, units = "mins"))) %>%
filter(ride_length > 0 & ride_length < 1440)
ride_summary <- analysis_df %>%
group_by(member_casual) %>%
summarise(
count = n(),
mean_duration = mean(ride_length),
median_duration = median(ride_length),
min_duration = min(ride_length),
max_duration = max(ride_length),
sd_duration = sd(ride_length)
)
ride_summary
## # A tibble: 2 × 7
## member_casual count mean_duration median_duration min_duration max_duration
## <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 casual 71138 36.5 22.0 0.0167 1436.
## 2 member 720126 11.4 8.47 0.0167 1433.
## # ℹ 1 more variable: sd_duration <dbl>
ggplot(analysis_df, aes(x = member_casual, y = ride_length, fill = member_casual)) +
geom_boxplot() +
coord_cartesian(ylim = c(0, 60)) +
labs(
title = "Ride Duration by User Type",
x = "User Type",
y = "Ride Duration (minutes)",
fill = "User Type"
) +
theme_minimal()
analysis_2020 <- analysis_df %>% filter(year == 2020) %>%
mutate(day_of_week = factor(weekdays(started_at),
levels = c("Monday", "Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday")))
day_usage <- analysis_2020 %>%
group_by(member_casual, day_of_week) %>%
summarise(total_rides = n()) %>%
arrange(member_casual, desc(total_rides))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
day_usage
## # A tibble: 14 × 3
## # Groups: member_casual [2]
## member_casual day_of_week total_rides
## <chr> <fct> <int>
## 1 casual Sunday 14820
## 2 casual Saturday 7445
## 3 casual Wednesday 5846
## 4 casual Tuesday 5225
## 5 casual Friday 5103
## 6 casual Thursday 4819
## 7 casual Monday 4785
## 8 member Tuesday 69692
## 9 member Wednesday 63969
## 10 member Monday 61916
## 11 member Thursday 61236
## 12 member Friday 55477
## 13 member Sunday 35959
## 14 member Saturday 30095
ggplot(analysis_2020, aes(x = day_of_week, fill = member_casual)) +
geom_bar(position = "dodge") +
labs(
title = "Number of Rides by Day of Week",
x = "Day of Week",
y = "Number of Rides",
fill = "User Type"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Based on our findings, we recommend the following strategies to help convert casual riders into annual members:
These actions can help Cyclistic capitalize on casual rider behavior and grow its member base.