casey jenkins Code, Technology & Data Science

Company Deep Dive

Deep-Dive.r

#load need libraries
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(geosphere)
## Loading required package: sp
library(leaflet)
library(ggplot2)
#import the data set
df <- read.csv("~/r-code/jumpmen/CompanyData.csv",stringsAsFactors = FALSE)
#lets take a look at the data frame
str(df)
## 'data.frame':	5983 obs. of  18 variables:
##  $ delivery_id                        : int  1457973 1377056 1476547 1485494 1327707 1423142 1334106 1311619 1487674 1417206 ...
##  $ customer_id                        : int  327168 64452 83095 271149 122609 75169 101347 59161 55375 153816 ...
##  $ jumpman_id                         : int  162381 104533 132725 157175 118095 91932 124897 79847 181543 157415 ...
##  $ vehicle_type                       : chr  "van" "bicycle" "bicycle" "bicycle" ...
##  $ pickup_place                       : chr  "Melt Shop" "Prince Street Pizza" "Bareburger" "Juice Press" ...
##  $ place_category                     : chr  "American" "Pizza" "Burger" "Juice Bar" ...
##  $ item_name                          : chr  "Lemonade" "Neapolitan Rice Balls" "Bare Sodas" "OMG! My Favorite Juice!" ...
##  $ item_quantity                      : int  1 3 1 1 2 1 1 2 NA 1 ...
##  $ item_category_name                 : chr  "Beverages" "Munchables" "Drinks" "Cold Pressed Juices" ...
##  $ how_long_it_took_to_order          : chr  "00:19:58.582052" "00:25:09.107093" "00:06:44.541717" "" ...
##  $ pickup_lat                         : num  40.7 40.7 40.7 40.7 40.7 ...
##  $ pickup_lon                         : num  -74 -74 -74 -74 -74 ...
##  $ dropoff_lat                        : num  40.8 40.7 40.7 40.8 40.7 ...
##  $ dropoff_lon                        : num  -74 -74 -74 -74 -74 ...
##  $ when_the_delivery_started          : chr  "2014-10-26 13:51:59.898924" "2014-10-16 21:58:58.65491" "2014-10-28 21:39:52.654394" "2014-10-30 10:54:11.531894" ...
##  $ when_the_Jumpman_arrived_at_pickup : chr  "" "2014-10-16 22:26:02.120931" "2014-10-28 21:37:18.793405" "2014-10-30 11:04:17.759577" ...
##  $ when_the_Jumpman_left_pickup       : chr  "" "2014-10-16 22:48:23.091253" "2014-10-28 21:59:09.98481" "2014-10-30 11:16:37.895816" ...
##  $ when_the_Jumpman_arrived_at_dropoff: chr  "2014-10-26 14:52:06.313088" "2014-10-16 22:59:22.948873" "2014-10-28 22:04:40.634962" "2014-10-30 11:32:38.090061" ...
#lets check the integrity of the data missing values, etc...
#is.na(df) # returns TRUE of data is missing

#looks like we have some item_quantity values missing
#the time stamps where imported as strings we need to convert them to a date format
df$when_the_delivery_started <- ymd_hms(substr(df$when_the_delivery_started,1,19))
## Warning in as.POSIXlt.POSIXct(x, tz): unknown timezone 'default/America/
## Denver'
df$when_the_Jumpman_arrived_at_pickup <- ymd_hms(substr(df$when_the_Jumpman_arrived_at_pickup,1,19))
df$when_the_Jumpman_left_pickup <- ymd_hms(substr(df$when_the_Jumpman_left_pickup,1,19))
df$when_the_Jumpman_arrived_at_dropoff <- ymd_hms(substr(df$when_the_Jumpman_arrived_at_dropoff,1,19))
#it good to code days like monday, tuesday, etc.. into integers so we can use them numerical data. We will create new columns and
#add the below values.
df$wday_delivery_started <- wday(df$when_the_delivery_started)
df$weekend_delivery_started <- ifelse(df$wday_delivery_started %in% c(1,7),1,0)
df$day_delivery_started <- (day(df$when_the_delivery_started))
#creat time differenc columns for the jumpman timestamps
df$delivery_time <- difftime(df$when_the_Jumpman_arrived_at_dropoff,
                             df$when_the_Jumpman_left_pickup,
                             units="hours")
df$loading_time <- difftime(df$when_the_Jumpman_left_pickup,
                            df$when_the_Jumpman_arrived_at_pickup,
                            units="hours")
df$jumpman_arrival_time <- difftime(df$when_the_Jumpman_arrived_at_pickup,
                            df$when_the_delivery_started,
                            units="hours")
#delivery distance based off the lat and long to meters
df$delivery_distance <- 0
for(i in 1:nrow(df))
{ 
  df[i,'delivery_distance'] <- distm(c(df[i,"dropoff_lat"],df[i,"dropoff_lon"]),
                                     c(df[i,"pickup_lat"],df[i,"pickup_lon"]),
                                     fun=distHaversine)/1609.34
}
#compute the average jumpman speed and put in new column
df$jumpman_avg_speed <- df$delivery_distance/as.numeric(df$delivery_time)
#calculate average jumpman speed to delivery
df$jumpman_avg_speed <- df$delivery_distance/as.numeric(df$delivery_time)

#lets retreive the distinct values and discard the rest
df_unique <- df %>% distinct(delivery_id, .keep_all = TRUE)
#weekend vs weekday by dropoffs
leaflet() %>% setView(-73.972887,40.732828,zoom=12) %>% addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(data=subset(df_unique,weekend_delivery_started==0),
                   lat=~dropoff_lat,lng=~dropoff_lon,weight=1,radius=2,opacity=1,color="Green") %>%
  addCircleMarkers(data=subset(df_unique,weekend_delivery_started==1),
                   lat=~dropoff_lat,lng=~dropoff_lon,weight=1,radius=2,opacity=1,color="Red") %>%
  addLegend("bottomleft",colors =c("Red", "Green"),labels= c("Weekend","Weekday"),opacity = 1)
#weekend vs weekday by pickup
leaflet() %>% setView(-73.972887,40.732828,zoom=12) %>% addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>%
  addCircleMarkers(data=subset(df_unique,weekend_delivery_started==0),
                   lat=~pickup_lat,lng=~pickup_lon,weight=1,radius=3,opacity=1,color="Orange") %>%
  addCircleMarkers(data=subset(df_unique,weekend_delivery_started==1),
                   lat=~pickup_lat,lng=~pickup_lon,weight=1,radius=2,opacity=1,color="Blue")%>%
  addLegend("bottomright",colors =c("Blue", "Orange"),labels= c("Weekend","Weekday"),opacity = 1) 
#unique number of customers
paste(length(unique(df_unique$customer_id))," Unique Customers")
## [1] "3192  Unique Customers"
ggplot(data.frame(as.vector(table(df_unique$customer_id)))) +
  geom_histogram(bins=30,aes(x=as.vector.table.df_unique.customer_id..))+
  ggtitle("Customer Order Frequency - Histogram")+
  xlab("Orders/Customer")+
  ylab("Number of Customers") + theme_light()
#vehicle usage
ggplot(df_unique,aes(x=vehicle_type, 1,group=1)) +
  stat_summary(fun.y = sum,geom = "bar")+
  ggtitle("Number of Deliveries by Vehicle Type")+
  xlab("Vehicle Type")+ylab("Number of Deliveries") + theme_light()

#vechile usage by days of the week
ggplot(df_unique,aes(x=wday(when_the_delivery_started,label=T), 1,group=vehicle_type,color=vehicle_type)) +
  stat_summary(fun.y = sum,geom = "line",size=.5)+
  ggtitle("Vehicle usage by Days of the Week")+
  xlab("Days of the Week")+ylab("Number of Deliveries") + theme_light() 

#range of dates deliver start
paste("Dates range from ",
      min(df_unique$when_the_delivery_started),
      " to ",
      max(df_unique$when_the_delivery_started)
      )
## [1] "Dates range from  2014-10-01 00:07:58  to  2014-10-30 23:08:43"
#customers acquired
cust_acq <- df_unique %>%
  group_by(customer_id) %>%
  summarise(first_day=min(day(when_the_delivery_started)))

ggplot(cust_acq,aes(x=first_day,y=1)) +
  stat_summary(fun.y=sum,geom="line", colour= "blue") +
  ggtitle("Number of New Customers Acquired per Day")+
  ylab("Number of Customers")+
  xlab("Days in October 2014") + theme_light() 

#delivery trends
ggplot(df_unique,aes(x=wday(when_the_delivery_started,label=T), 1,group=1)) +
  stat_summary(fun.y = sum,geom = "line", colour= "blue")+
  ggtitle("Deliveries by Days of the Week")+
  ylab("Number of Deliveries")+xlab("Days of the Week") + theme_light() 

#delivery time by day of the week
ggplot(df_unique,aes(x=vehicle_type,y=delivery_time))+
  geom_tile()+
  ggtitle("Delivery Time variation across Vehicle Types")+
  xlab("Vehicle Types")+ylab("Delivery Time (Hours)") + theme_light() 
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 495 rows containing missing values (geom_tile).

#jumpman arrival by day of the week
ggplot(df_unique,aes(x=wday(when_the_delivery_started,label=T),y=delivery_time))+
  geom_tile()+
  ggtitle("Loading Time variation across Days of the Week")+
  xlab("Days of the Week")+ylab("Delivery Time") + theme_light()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 495 rows containing missing values (geom_tile).

ggplot(df_unique,aes(x=vehicle_type,y=loading_time))+
  geom_tile()+
  ggtitle("Loading Time variation across Vehicle Types")+
  xlab("Vehicle Types")+ylab("Loading Time (Hours)") + theme_light()
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 495 rows containing missing values (geom_tile).

ggplot(df_unique,aes(x=vehicle_type,y=delivery_distance))+
  geom_boxplot()+
  ggtitle("Delivery Distances by Vehicle Type")+
  xlab("Vehicle Type")+
  ylab("Distance (Miles)") + theme_light()

ggplot(df_unique,aes(x=vehicle_type,y=jumpman_avg_speed))+
  geom_boxplot()+
  ggtitle("Average Delivery Speed by Vehicle Type")+
  xlab("Vehicle Type")+
  ylab("Speed (MPH)") + theme_light()
## Warning: Removed 495 rows containing non-finite values (stat_boxplot).