2

I am trying to remove the lat/long coordinates in the string below to place into two separate columns "lat" and "long" in R. I haven't had much luck using separate in dplyr. Any help would be most appreciated.

data_clean <- c("7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)",
                "305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)",                   
                "210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)")      

data_clean_df <- as.data.frame(data_clean)
Phil
  • 7,287
  • 3
  • 36
  • 66
dlarson
  • 21
  • 1
  • I think you should be able to find your answer here: https://stackoverflow.com/questions/6208367/regex-to-match-stuff-between-parentheses – TTS Dec 24 '19 at 20:20

4 Answers4

2

We can use tidyr::extract dividing the data in data_clean into 3 groups.

library(dplyr)
library(tidyr)

data_clean_df %>%
   mutate(data_clean = gsub('\n', '', data_clean)) %>%
   extract(data_clean, into = c('address', 'lat', 'lon'), 
      regex = '(.*)\\((.*),\\s+(.*)\\)', convert = TRUE)

#                                    data_clean     lat      lon
#1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266 41.5613 -93.8065
#2                   305 AIRPORT RD\nAMES 50010 42.0011 -93.6137
#3       210 EAST TOWER PARK DR\nWATERLOO 50702 42.4564 -92.3526
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
1
library(dplyr)
library(tidyr)
library(stringr)

data_clean_df %>% 
  separate(data_clean, into = c("a", "b", "c"), sep = "\n") %>% 
  mutate(c = str_remove_all(c, "\\(|\\)")) %>%
  separate(c, c("lat", "lon"), sep = ", ", convert = TRUE)

                       a                     b      lat       lon
1  7205 MILLS CIVIC PKWY WEST DES MOINES 50266 41.56134 -93.80649
2         305 AIRPORT RD            AMES 50010 42.00112 -93.61365
3 210 EAST TOWER PARK DR        WATERLOO 50702 42.45636 -92.35255
Phil
  • 7,287
  • 3
  • 36
  • 66
1

Another option if you just want to pull out lat and long:

library(tidyverse)

data_clean <- c("7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)",
                "305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)",                   
                "210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)")      

data_clean_df <- as.data.frame(data_clean, stringsAsFactors = F)

data_clean_df %>%
  mutate(lat = str_extract(data_clean, "(?<=\\().*?(?=,)"),
         long = str_extract(data_clean, paste0("(?<=", lat, ",\\s).*?(?=\\))")))
#>                                                              data_clean
#> 1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266\n(41.561342, -93.806489)
#> 2                    305 AIRPORT RD\nAMES 50010\n(42.001123, -93.61365)
#> 3       210 EAST TOWER PARK DR\nWATERLOO 50702\n(42.456362, -92.352552)
#>         lat       long
#> 1 41.561342 -93.806489
#> 2 42.001123  -93.61365
#> 3 42.456362 -92.352552
AndS.
  • 7,748
  • 2
  • 12
  • 17
1

Here is a base R solution using gsub()

df <- data.frame(data_clean = gsub("(.*)\n.*","\\1",data_clean),
                 lat = gsub(".*?\\((.*),.*","\\1",data_clean),
                 lon = gsub(".*,(.*)\\)","\\1",data_clean))

such that

                                    data_clean       lat         lon
1 7205 MILLS CIVIC PKWY\nWEST DES MOINES 50266 41.561342  -93.806489
2                   305 AIRPORT RD\nAMES 50010 42.001123   -93.61365
3       210 EAST TOWER PARK DR\nWATERLOO 50702 42.456362  -92.352552
ThomasIsCoding
  • 96,636
  • 9
  • 24
  • 81