1

I am bringing filtered JSON data from Elastic as a nested list with multiple levels. I am interesting in converting the nested list into 2d in Rstudio for data analysis.

I have tried mutliple tips online but none have worked (plytr,rbind, unlist, tidyjson). I am desperate for help

Example of the structure:

> str(RAW2)
List of 10
 $ :List of 5
  ..$ _index: chr "xml-search-2017.06.12"
  ..$ _type : chr "xml"
  ..$ _id   : chr "76595087100_1"
  ..$ _score: num 13.5
  ..$ fields:List of 3
  .. ..$ check_in_date :List of 1
  .. .. ..$ : chr "2017-06-20T00:00:00.000Z"
  .. ..$ check_out_date:List of 1
  .. .. ..$ : chr "2017-06-23T00:00:00.000Z"
  .. ..$ ts_start      :List of 1
  .. .. ..$ : chr "2017-06-12T20:11:07.348Z"
 $ :List of 5
  ..$ _index: chr "xml-search-2017.06.12"
  ..$ _type : chr "xml"
  ..$ _id   : chr "76595087300_1"
  ..$ _score: num 13.5
  ..$ fields:List of 3
  .. ..$ check_in_date :List of 1
  .. .. ..$ : chr "2017-06-20T00:00:00.000Z"
  .. ..$ check_out_date:List of 1
  .. .. ..$ : chr "2017-06-23T00:00:00.000Z"
  .. ..$ ts_start      :List of 1
  .. .. ..$ : chr "2017-06-12T20:11:07.060Z"

Thank you

list(structure(list(`_index` = "xml-search-2017.06.12", `_type` = "xml", 
    `_id` = "76595087100_1", `_score` = 13.457847, fields = structure(list(
        check_in_date = list("2017-06-20T00:00:00.000Z"), check_out_date = list(
            "2017-06-23T00:00:00.000Z"), ts_start = list("2017-06-12T20:11:07.348Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76595087300_1", `_score` = 13.457847, 
    fields = structure(list(check_in_date = list("2017-06-20T00:00:00.000Z"), 
        check_out_date = list("2017-06-23T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:11:07.060Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76594844800_1", `_score` = 13.455816, 
    fields = structure(list(check_in_date = list("2017-06-20T00:00:00.000Z"), 
        check_out_date = list("2017-06-22T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:11:03.445Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76597264600_1", `_score` = 13.455816, 
    fields = structure(list(check_in_date = list("2017-06-13T00:00:00.000Z"), 
        check_out_date = list("2017-06-16T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:13:15.005Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76599762900_1", `_score` = 13.455723, 
    fields = structure(list(check_in_date = list("2017-06-22T00:00:00.000Z"), 
        check_out_date = list("2017-06-28T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:14:37.454Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76595493900_1", `_score` = 13.455723, 
    fields = structure(list(check_in_date = list("2017-06-20T00:00:00.000Z"), 
        check_out_date = list("2017-06-23T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:11:07.348Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76597065400_1", `_score` = 13.169026, 
    fields = structure(list(check_in_date = list("2017-06-13T00:00:00.000Z"), 
        check_out_date = list("2017-06-16T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:13:14.994Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76594777600_1", `_score` = 13.169026, 
    fields = structure(list(check_in_date = list("2017-06-20T00:00:00.000Z"), 
        check_out_date = list("2017-06-22T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:11:03.440Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76572404700_1", `_score` = 13.169026, 
    fields = structure(list(check_in_date = list("2017-06-13T00:00:00.000Z"), 
        check_out_date = list("2017-06-14T00:00:00.000Z"), ts_start = list(
            "2017-06-12T19:53:56.580Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76598481000_1", `_score` = 12.763965, 
    fields = structure(list(check_in_date = list("2017-06-22T00:00:00.000Z"), 
        check_out_date = list("2017-06-28T00:00:00.000Z"), ts_start = list(
            "2017-06-12T20:14:37.452Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")))

Subset 2:

list(structure(list(`_index` = "xml-search-2017.06.12", `_type` = "xml", 
    `_id` = "76452356700_1", `_score` = 2.390721, fields = structure(list(
        check_in_date = list("2017-06-28T00:00:00.000Z"), check_out_date = list(
            "2017-07-02T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:21.311Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452363400_3", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-20T00:00:00.000Z"), 
        check_out_date = list("2017-06-30T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:21.235Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452375900_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-09-01T00:00:00.000Z"), 
        check_out_date = list("2017-09-03T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:30.092Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452377300_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-22T00:00:00.000Z"), 
        check_out_date = list("2017-06-24T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:31.633Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452383100_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Rodeway Inn South Miami", 
        "East Miami", "Holiday Inn Port Of Miami", "Mayfair Hotel & Spa", 
        "Intercontinental Miami", "Marriott Biscayne Bay", "Eb Hotel Miami", 
        "Doubletree Grand Biscayne Bay", "Hotel Beaux Arts Miami", 
        "Cambria Suites Miami Airport", "Epic Miami, A Kimpton Hotel", 
        "The Ritz-Carlton Coconut Grove", "Quality Inn Miami Airport", 
        "Hilton Miami Downtown", "Conrad Miami", "Miccosukee Resort & Gaming", 
        "Courtyard Downtown", "Jw Marriott Marquis Miami", "Miami Marriott Dadeland", 
        "Courtyard Miami Coral Gables"), check_in_date = list(
        "2017-08-26T00:00:00.000Z"), check_out_date = list("2017-08-29T00:00:00.000Z"), 
        ts_start = list("2017-06-12T18:19:30.198Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452394200_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Rosen Plaza"), 
        check_in_date = list("2017-06-24T00:00:00.000Z"), check_out_date = list(
            "2017-06-27T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:31.672Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452409700_2", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Jw Marriott Hotel Mumbai Sahar"), 
        check_in_date = list("2018-03-12T00:00:00.000Z"), check_out_date = list(
            "2018-03-16T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:46.007Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452420500_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Holiday Inn Express Troisdorf"), 
        check_in_date = list("2017-06-25T00:00:00.000Z"), check_out_date = list(
            "2017-06-28T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:40.676Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452421700_2", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-23T00:00:00.000Z"), 
        check_out_date = list("2017-06-26T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:40.932Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452424300_2", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Aston Balikpapan"), 
        check_in_date = list("2017-07-05T00:00:00.000Z"), check_out_date = list(
            "2017-07-06T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:42.293Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452425100_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("H10 Duque De Loule"), 
        check_in_date = list("2017-07-28T00:00:00.000Z"), check_out_date = list(
            "2017-07-30T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:42.594Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452425500_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-25T00:00:00.000Z"), 
        check_out_date = list("2017-06-26T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:42.719Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452425600_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-17T00:00:00.000Z"), 
        check_out_date = list("2017-06-19T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:42.748Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452427300_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-17T00:00:00.000Z"), 
        check_out_date = list("2017-06-20T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:43.154Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452428100_10", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-09-11T00:00:00.000Z"), 
        check_out_date = list("2017-09-24T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:43.345Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452428800_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Ibis Istanbul City West"), 
        check_in_date = list("2017-06-30T00:00:00.000Z"), check_out_date = list(
            "2017-07-01T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:43.761Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452431500_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("The Rani Hotel And Spa"), 
        check_in_date = list("2017-07-07T00:00:00.000Z"), check_out_date = list(
            "2017-07-14T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:45.460Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452431700_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-17T00:00:00.000Z"), 
        check_out_date = list("2017-06-20T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:45.642Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452434500_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Cloitre St Louis", 
        "De L'Horloge", "Hotel D'Europe", "Danieli", "Bristol", 
        "Kyriad Palais Des Papes", "Regina"), check_in_date = list(
        "2017-06-24T00:00:00.000Z"), check_out_date = list("2017-06-25T00:00:00.000Z"), 
        ts_start = list("2017-06-12T18:19:47.037Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452434700_2", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-09-08T00:00:00.000Z"), 
        check_out_date = list("2017-09-20T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:47.086Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452447400_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-09-25T00:00:00.000Z"), 
        check_out_date = list("2017-09-30T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:51.056Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452451400_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-07-01T00:00:00.000Z"), 
        check_out_date = list("2017-07-04T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:40.306Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452452500_1", `_score` = 2.390721, 
    fields = structure(list(check_in_date = list("2017-06-18T00:00:00.000Z"), 
        check_out_date = list("2017-06-19T00:00:00.000Z"), ts_start = list(
            "2017-06-12T18:19:52.461Z")), .Names = c("check_in_date", 
    "check_out_date", "ts_start"))), .Names = c("_index", "_type", 
"_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452452800_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Best Western Grand West'S Villas"), 
        check_in_date = list("2017-07-12T00:00:00.000Z"), check_out_date = list(
            "2017-07-14T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:40.753Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")), structure(list(`_index` = "xml-search-2017.06.12", 
    `_type` = "xml", `_id` = "76452453600_1", `_score` = 2.390721, 
    fields = structure(list(hotel_name = list("Athens Status Suites", 
        "Ambrosia Hotel & Suites", "A For Athens", "Athenswas", 
        "Kimon Athens Hotel", "Chic", "Athinais", "Novus City Hotel"), 
        check_in_date = list("2017-08-12T00:00:00.000Z"), check_out_date = list(
            "2017-08-25T00:00:00.000Z"), ts_start = list("2017-06-12T18:19:54.035Z")), .Names = c("hotel_name", 
    "check_in_date", "check_out_date", "ts_start"))), .Names = c("_index", 
"_type", "_id", "_score", "fields")))

2 Answers2

3

It's not totally clean yet, but here's an option:

library(tidyverse)

df <- raw2 %>% map_df(flatten) %>% unnest()

df
#> # A tibble: 10 x 7
#>                 `_index` `_type`         `_id` `_score`
#>                    <chr>   <chr>         <chr>    <dbl>
#>  1 xml-search-2017.06.12     xml 76595087100_1 13.45785
#>  2 xml-search-2017.06.12     xml 76595087300_1 13.45785
#>  3 xml-search-2017.06.12     xml 76594844800_1 13.45582
#>  4 xml-search-2017.06.12     xml 76597264600_1 13.45582
#>  5 xml-search-2017.06.12     xml 76599762900_1 13.45572
#>  6 xml-search-2017.06.12     xml 76595493900_1 13.45572
#>  7 xml-search-2017.06.12     xml 76597065400_1 13.16903
#>  8 xml-search-2017.06.12     xml 76594777600_1 13.16903
#>  9 xml-search-2017.06.12     xml 76572404700_1 13.16903
#> 10 xml-search-2017.06.12     xml 76598481000_1 12.76397
#> # ... with 3 more variables: check_in_date <chr>, check_out_date <chr>,
#> #   ts_start <chr>
alistaire
  • 42,459
  • 4
  • 77
  • 117
  • Those `map*` functions of `purrr` seem very handy – Sotos Jun 17 '17 at 14:10
  • 1
    Yeah, I love `map_df`. Now that I've seen it, it seems like a huge omission that the base versions can't simplify directly to a data.frame. – alistaire Jun 17 '17 at 14:14
  • @alistaire that looks very promising! worked for the example I provided, but when trying similar for a different sub-set I get an error. Have attached the new subset on the main text. Could it be because some records have larger number of fields? (ie, some have x number of hotel names) – Daniel Delgado Alba Jun 17 '17 at 14:43
  • Yeah, the new variable is both coming and going (and so can be `NULL`) and is of variable length. Maybe `raw3 %>% map(flatten) %>% map_df(as_data_frame) %>% mutate(hotel_name = map(hotel_name, ~.x %||% NA)) %>% unnest()` – alistaire Jun 17 '17 at 14:54
  • @alistaire, that is right, the data contains hotel searches and each search could have 0:n hotels. (same for other fields). I tried with the code above you advised but I'm getting error. "Error in bind_rows_(x, .id) : Can not automatically convert from character to numeric in column "value"." My objective is to aggregate hotels and run analysis on the number of searches for each. Can you think of a way to extract those 0:n hotels within each record and aggregate? Really appreciate your help! – Daniel Delgado Alba Jun 18 '17 at 08:57
1

A base R version could be,

do.call(rbind, lapply(l1, function(i) as.data.frame(t(unlist(i)))))
Sotos
  • 51,121
  • 6
  • 32
  • 66