0

I'm working on the Google Capstone Project (cyclistic, analysis of bike share data) part of the Google Data Analytics course on Coursera.

I'm having issues converting two variables to date format.

The public data are located here

I've downloaded the data from 202109 to 202208.

I am trying to convert the "started_at" and "ended_at" variables to date format, so that I can calculate durations and other measures. This is what I have so far:

library(readxl)
library(ggplot2)
library(dplyr)
library(psych)
library(tidyr)
library(chron)
library(tidyverse)

library(lubridate) #for date functions , helps wrangle data attributes
library(data.table)
library(janitor)
###################################################################################################
###                          CLEAR WORKSPACE AND SET DIRECTORY
###################################################################################################

rm(list = ls())
rm(list = ls(all.names = TRUE)) #will clear all objects includes hidden objects.
setwd("[...folder path containing the .csv files...]")
getwd()

dir() # Show all files in the working directory

###################################################################################################
###                           Set some useful functions
###################################################################################################
detach_package = function(pkg, character.only = FALSE)
{
  if(!character.only)
  {
    pkg = deparse(substitute(pkg))
  }
  search_item = paste("package", pkg, sep = ":")
  while(search_item %in% search())
  {
    detach(search_item, unload = TRUE, character.only = TRUE)
  }
}

#  Standard Error Function
stdErr = function(x) sd(x, na.rm = TRUE)/sqrt(length(x))

###################################################################################################
###                          LOAD and combine DATA
###################################################################################################
# I am loading all the .csv files individually to separate data.frames
data_01 = read.csv("202109-divvy-tripdata.csv")
data_02 = read.csv("202110-divvy-tripdata.csv")
data_03 = read.csv("202111-divvy-tripdata.csv")
data_04 = read.csv("202112-divvy-tripdata.csv")
data_05 = read.csv("202201-divvy-tripdata.csv")
data_06 = read.csv("202202-divvy-tripdata.csv")
data_07 = read.csv("202203-divvy-tripdata.csv")
data_08 = read.csv("202204-divvy-tripdata.csv")
data_09 = read.csv("202205-divvy-tripdata.csv")
data_010 = read.csv("202206-divvy-tripdata.csv")
data_011 = read.csv("202207-divvy-tripdata.csv")
data_012 = read.csv("202208-divvy-tripdata.csv")


# Combine all in one data.frame
totDb = rbind(data_01, data_02, data_03, data_04, data_05, 
                  data_06, data_07, data_08, data_09, data_010,
                  data_011, data_012)

# Remove the individual datasets
rm(data_01, data_02, data_03, data_04, data_05, 
   data_06, data_07, data_08, data_09, data_010,
   data_011, data_012); print("I've removed the original files")


###################################################################################################
###                           Remove NA an Duplicates
###################################################################################################
# Remove Missing Values
totDb_Clean = totDb %>%
  filter(start_station_name!="") %>% 
  filter(end_station_name!="")
  
# remove duplicates
totDb_Clean = totDb_Clean[!duplicated(totDb_Clean$ride_id), ]
print(paste("Removed", nrow(totDb) - nrow(totDb_Clean), "duplicated rows"))

# Remove extra spaces, if any
totDb_Clean$started_at = str_squish(totDb_Clean$started_at)
totDb_Clean$ended_at   = str_squish(totDb_Clean$ended_at)`

###################################################################################################
###                           Convert to date and calculate the ride duration
###################################################################################################

This is where I am having troubles. I have tried to convert these variables to date in several ways but nothing seems to work. Here are some of my attempts:

1)

 totDb_Clean$date     = as.Date(totDb_Clean$started_at)
    totDb_Clean$week_day = format(as.Date(totDb_Clean$date), "%A")
    totDb_Clean$month    = format(as.Date(totDb_Clean$date), "%b_%y")
    totDb_Clean$year     =format(totDb_Clean$date,"%Y")
  1. totDb_Clean = totDb_Clean%>% mutate(started_at = ymd_hms(as_datetime(started_at)), ended_at = ymd_hms(as_datetime(ended_at)))

  2. Convert to date

    totDb_Clean2$started_at = as.POSIXct(totDb_Clean$started_at, tz = "GMT6", format = "%m/%d/%Y %H:%M") totDb_Clean2$ended_at = as.POSIXct(totDb_Clean$ended_at, tz = "GMT6", format = "%m/%d/%Y %H:%M")

None of these approaches seems to be working. Any help is appreciated, thank you for your time!

Glu
  • 327
  • 3
  • 14
  • Does this answer your question? [Date conversion from POSIXct to Date in R](https://stackoverflow.com/questions/16557028/date-conversion-from-posixct-to-date-in-r) – Matt Summersgill Jun 30 '23 at 14:41

0 Answers0