Ford GoBike data from 2018/01 through 2019/07. The dataset was downloaded from https://www.fordgobike.com/system-data. It covers each ride's start and end dates, station information and location, and the riders' member type, gender, and age.
Our main age groups fall between 20 and 60, so we will cut off all users over 60 and focus on breaking the ages into bins.
# Age distribution of riders, drawn as a single horizontal box plot of member_age.
plt.figure(figsize=(10, 4))
sb.boxplot(x='member_age', data=tbikedata, palette='Blues', orient='h')
plt.title("Age Distribution", fontsize=16, y=1)
plt.xlabel("member_age", fontsize=12, labelpad=15)
# A one-variable box plot has no count axis, so the y-axis is left unlabelled;
# calling it "counts" would misdescribe the chart.
plt.ylabel("");
We will look at the bike ride trends in terms of age group, gender, hour of the day, and day of the week.
Most users fell into the 20-30 and 30-40 age bins.
# Age-group analysis: one row per age bin holding that bin's ride count.
age_df = tbikedata.groupby('member_age_bins')[['bike_id']].count()
# Each bin's ride count as a percentage of all counted rides.
age_df['perc'] = age_df['bike_id'].div(age_df['bike_id'].sum()).mul(100)
# Vertical bar chart of the percentages.
age_df['perc'].plot.bar(figsize=(8, 5))
plt.title('Percentage of Rides by Age Groups', y=1, fontsize=20)
plt.xlabel('Member Age Group', labelpad=15)
plt.ylabel('Percentage % ', labelpad=15)
# 360 degrees is a full turn, so the tick labels end up horizontal.
plt.xticks(rotation=360);
There are more male (M) users than female (F) users.
# Gender analysis: ride count for each value of member_gender.
gender_df = tbikedata.groupby('member_gender')[['bike_id']].count()
# Each gender's ride count as a percentage of all counted rides.
gender_df['perc'] = gender_df['bike_id'].div(gender_df['bike_id'].sum()).mul(100)
# Horizontal bar chart of the percentages.
gender_df['perc'].plot.barh(figsize=(8, 5))
plt.title('Percentage of Bike Rides by Gender', y=1, fontsize=16)
plt.xlabel('Percentage %', labelpad=15)
plt.ylabel('Gender', labelpad=15)
plt.xticks(rotation=360)
# Pin the x-axis to the full 0-100 range so bar lengths read directly as percentages.
plt.xlim(0, 100);
The peak times for bike rides were between 7AM-9AM and 4PM-7PM (hours 16-19).
# Hour-of-day analysis: ride count per st_hour, with st_hour moved back into a column.
hour_df = tbikedata.groupby('st_hour')[['bike_id']].count().reset_index()
# Each hour's ride count as a percentage of all counted rides.
hour_df['perc'] = hour_df['bike_id'].div(hour_df['bike_id'].sum()).mul(100)
# Point plot of percentage against start hour.
plt.figure(figsize=(8, 5))
sb.pointplot(x='st_hour', y='perc', data=hour_df, scale=0.6)
plt.title('Percentage of Bike Rides By Hour of the Day', y=1, fontsize=16)
plt.xlabel('Hour of the Day', labelpad=15)
plt.ylabel('Percentage % ', labelpad=15)
plt.xticks(rotation=360);
Rides were fairly even across the weekdays and dropped off on the weekends.
# Weekday analysis: ride count for each value of st_weekday.
weekday_df = tbikedata.groupby('st_weekday')[['bike_id']].count()
# Each weekday's ride count as a percentage of all counted rides.
weekday_df['perc'] = weekday_df['bike_id'].div(weekday_df['bike_id'].sum()).mul(100)
# Plot styling: Monday-first ordering and the fifth colour of the current seaborn palette.
# NOTE(review): assumes st_weekday holds full English day names - confirm upstream.
day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
]
base_color = sb.color_palette()[4]
# Reindex so the bars follow day_order rather than the groupby's own ordering.
weekday_df['perc'].reindex(day_order).plot.bar(color=base_color, figsize=(10, 5))
plt.title('Percentage of Bike Rides on the Weekdays', y=1, fontsize=16)
plt.xlabel('Weekday', labelpad=15)
plt.ylabel('Percentage %', labelpad=15)
plt.xticks(rotation=360);
We will look at the bike ride trends in terms of user type (subscribers vs. customers), broken down by age group, trip duration, and month.
20-30 and 30-40 were the largest age bins for subscribers
# Ride counts per (year-month, age bin), built separately for each user type.
# Subscribers:
subscriber_df = (
    tbikedata[tbikedata['user_type'] == 'Subscriber']
    .groupby(['YYMM', 'member_age_bins'])
    .agg({'bike_id': 'count'})
    .reset_index()
)
# Customers, aggregated the same way:
customer_df = (
    tbikedata[tbikedata['user_type'] == 'Customer']
    .groupby(['YYMM', 'member_age_bins'])
    .agg({'bike_id': 'count'})
    .reset_index()
)
# Monthly ride-count trend for subscribers, one line per age bin.
plt.figure(figsize=(12, 5))
ax = sb.pointplot(
    x='YYMM',
    y='bike_id',
    hue='member_age_bins',
    data=subscriber_df,
    scale=0.4,
)
ax.set_title("Monthly Trend of Rides by Subscribers' Age Group", fontsize=16, y=1)
ax.set_xlabel('YY-MM', labelpad=15)
ax.set_ylabel('Bike Rides', labelpad=15)
plt.xticks(rotation=360)
# Rebuild the legend with a readable title in place of the raw column name.
legend = ax.legend(title='Member Age Group');
20-30 and 30-40 were the largest age bins for customers as well
# Monthly ride-count trend for customers, one line per age bin.
plt.figure(figsize=(12, 5))
ax = sb.pointplot(
    x='YYMM',
    y='bike_id',
    hue='member_age_bins',
    data=customer_df,
    scale=0.4,
)
ax.set_title("Monthly Trend of Rides by Customers' Age Group", fontsize=16, y=1)
ax.set_xlabel('year-month', labelpad=15)
ax.set_ylabel('Bike Rides', labelpad=15)
plt.xticks(rotation=360)
# Rebuild the legend with a readable title in place of the raw column name.
legend = ax.legend(title='Member Age Group');
However, customers took longer rides than subscribers
# Mean of duration_min per user type, drawn as horizontal bars.
(
    tbikedata
    .groupby('user_type')['duration_min']
    .mean()
    .plot.barh(figsize=(8, 5))
)
plt.title('Avg Trip Duration by User Type', y=1, fontsize=16)
plt.xlabel('Avg Trip Duration (mins)', labelpad=15)
plt.ylabel('User Type', labelpad=15)
plt.xticks(rotation=360);
The width of each violin plot indicates the relative number of rides: the wider the violin is at a given duration (in minutes, on the y-axis), the more rides had that duration.
Subscribers took shorter rides overall, but took more rides when compared to customers
# Keep only the two columns the violin plot needs.
user_type_duration_df = tbikedata[['user_type', 'duration_min']]
# Restrict to rides with duration_min of at most 60 before plotting.
user_type_duration_df_60 = user_type_duration_df.loc[
    user_type_duration_df['duration_min'].le(60)
]
# One violin per user type showing the distribution of duration_min.
sb.violinplot(x='user_type', y='duration_min', data=user_type_duration_df_60);
# Title and axis labels are title-cased at call time and set in bold.
plt.title(
    'Distribution of Trip Durations by User Type'.title(),
    weight="bold",
    fontsize=14,
)
plt.xlabel('User Types'.title(), weight="bold", fontsize=12)
plt.ylabel('Duration in Minutes'.title(), weight="bold", fontsize=12);
Subscribers showed growth and more peaks/dips while being much higher than Customers. Customers remained mostly steady throughout the months/year.
# Monthly ride trend per user type, with a fixed colour for each type.
palette = dict(Subscriber='green', Customer='blue')
plt.figure(figsize=(12, 4))
# n_user_type_YYMM is built elsewhere; the plotted values come from its column named 0.
ax = sb.pointplot(
    x='YYMM',
    y=0,
    hue='user_type',
    data=n_user_type_YYMM,
    palette=palette,
    scale=0.3,
)
ax.set_title('Monthly Trend of Rides by User Type', fontsize=16, y=1)
ax.set_xlabel('YYMM', labelpad=15)
ax.set_ylabel('Rides', labelpad=15)
# Rebuild the legend with a readable title in place of the raw column name.
legend = ax.legend(title='User Type');
# Four annotated heatmaps on a 2x2 grid, one per subscriber age group.
plt.figure(figsize=(14.70, 10.27))
plt.suptitle('age group, weekdays, hrs/day and bike rides', fontsize=16, y=1)
# Each entry pairs a pivot table (built elsewhere) with its panel title.
heatmap_panels = (
    (sub_1_pivot, '20-30 yr old subscribers'),
    (sub_2_pivot, '30-40 yr old subscribers'),
    (sub_3_pivot, '40-50 yr old subscribers'),
    (sub_4_pivot, '50-60 yr old subscribers'),
)
for panel_no, (panel_pivot, panel_title) in enumerate(heatmap_panels, start=1):
    # Panels fill the grid in reading order: top-left, top-right, bottom-left, bottom-right.
    plt.subplot(2, 2, panel_no)
    # Integer cell annotations in a very small font.
    sb.heatmap(panel_pivot, fmt='d', annot=True, cmap='YlGnBu', annot_kws={'size': 4})
    plt.title(panel_title)
    plt.xlabel('Hour of the Day', labelpad=5)
    plt.ylabel('Day of the Week', labelpad=10)
    # A full 360-degree turn leaves the y tick labels horizontal.
    plt.yticks(rotation=360)