I have JSON data in a .txt file which I'm trying to load into R, but I'm getting the following error:

Error in feed_push_parser(readBin(con, raw(), n), reset = TRUE) : 
  parse error: trailing garbage
      " : "SUCCESS"  }    /* 1 */  {    "_id" : "b736c374-b8ae-4e9
                 (right here) ------^

I'm assuming the error is caused by the multiple instances of /* (number) */, and I can't remove them all manually because my file has around 10k of them. Is there a way to remove these comment lines before loading the data into R?

My JSON file looks like below:

/* 0 */
{
  "_id" : "93ccbdb6-8947",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "93ccbdb6-8947",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 1 */
{
  "_id" : "b736c374-b8ae",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "b736c374-b8ae",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 2 */
{
  "_id" : "3312605f-8304",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXE",
    "queryId" : "3312605f-8304",
    "subRequests" : [{
        "origin" : "LON",
        "destination" : "IAD",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

/* 3 */
{
  "_id" : "6b668cfa-9b79",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1NXA",
    "queryId" : "6b668cfa-9b79",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 1,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 4 */
{
  "_id" : "41c373a1-e4cb",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "41c373a1-e4cb",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 5 */
{
  "_id" : "2c8331c4-21ca",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1KKP",
    "queryId" : "2c8331c4-21ca",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 6 */
{
  "_id" : "71a09900-1c13",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP6CXS",
    "queryId" : "71a09900-1c13",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AF",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "DL",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "LH",
        "fareClasses" : "",
        "owrt" : "1,2"
      }, {
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 6941,
  "status" : "SUCCESS"
}

/* 7 */
{
  "_id" : "a036a42a-918b",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1MMM",
    "queryId" : "a036a42a-918b",
    "subRequests" : [{
        "origin" : "WAS",
        "destination" : "LON",
        "carrier" : "AA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 0,
  "requestDate" : 20151205,
  "totalRecords" : 1388,
  "status" : "SUCCESS"
}

/* 8 */
{
  "_id" : "c547be36-805c",
  "uiSearchRequest" : {
    "travelDate" : 20151206,
    "travelDuration" : 7,
    "shopperDuration" : 30,
    "oneWay" : false,
    "userId" : "ATP1SXB",
    "queryId" : "c547be36-805c",
    "subRequests" : [{
        "origin" : "CHI",
        "destination" : "LON",
        "carrier" : "BA",
        "fareClasses" : "",
        "owrt" : "1,2"
      }]
  },
  "downloadCount" : 2,
  "requestDate" : 20151205,
  "totalRecords" : 1072,
  "status" : "SUCCESS"
}

My code is below (although I haven't gotten very far):

library(jsonlite)
library(RJSONIO)
json_data_raw<-fromJSON("mydata.txt")

json_file <- lapply(json_data_raw, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

output <- do.call("rbind", json_file)
write.csv(output, file = "json.csv", row.names = FALSE)
file.show("json.csv")

I'm trying to get my output into a CSV file like below

Lester Pereira
    How exactly do you want to collapse this nested structure to a rectangular CSV file? What precisely would the output look like? You should include sample data in the question itself to [make the problem reproducible](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). Pictures of data aren't as useful. – MrFlick Jun 23 '16 at 16:15
  • `fromJSON(grep('/\\*.*\\*/', readLines(mydata.txt), invert = TRUE, value = TRUE))` gets rid of the comment lines, but still fails for me. – alistaire Jun 23 '16 at 16:38
  • @MrFlick thanks for the suggestion. I've made the changes. – Lester Pereira Jun 23 '16 at 17:25

2 Answers


There are several issues with your text file. As you already noticed, you need to remove the lines of the form /* 0 */. What results is still not valid JSON: if you want to have several JSON objects in one file, you need to store them in an array. The JSON objects are the parts enclosed in curly brackets, e.g.,

{
  "_id" : "93ccbdb6-8947-4687-8e12-edf4e40d6650",
  ...
  "totalRecords" : 0,
  "status" : "SUCCESS"
}

The structure of an array of objects is as follows:

[
  {
    ...
  },
  {
    ...
  }
]

To get your file into shape, you need to add a comma between the objects and wrap everything in square brackets. You could do this as follows:

raw <- readLines("mydata.txt")

# get rid of the "/* 0 */" lines
json <- grep("^/\\* [0-9]* \\*/", raw, value = TRUE, invert = TRUE)

# add missing comma after }
n <- length(json)
json[-n] <- gsub("^}$", "},", json[-n])

# add brackets at the beginning and end
json <- c("[", json, "]")

This can be read by fromJSON(), so I assume it is valid JSON:

library(jsonlite)
table <- fromJSON(json)
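
If you want an explicit check rather than relying on fromJSON() not throwing an error, jsonlite also provides a validate() function; here is a small sketch using the json vector built above:

# validate() returns TRUE for syntactically valid JSON; for invalid
# input it returns FALSE, with an attribute describing the parse error
validate(paste(json, collapse = "\n"))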

The table is nested, that is, some of the table's cells contain a data frame or a list themselves. For example,

table[1,2]
##   travelDate travelDuration shopperDuration oneWay  userId                              queryId
## 1   20151206              7              30  FALSE ATP1KKP 93ccbdb6-8947-4687-8e12-edf4e40d6650
##               subRequests
##     1 WAS, LON, AA, , 1,2

You could use flatten() from the jsonlite package to get a table with one level of nesting less:

flatten(table)[1:3, c(1, 6, 12)]
##                                    _id uiSearchRequest.travelDate uiSearchRequest.subRequests
## 1 93ccbdb6-8947-4687-8e12-edf4e40d6650                   20151206         WAS, LON, AA, , 1,2
## 2 b736c374-b8ae-4e99-8073-9c54517fecd5                   20151206         WAS, LON, AA, , 1,2
## 3 3312605f-8304-4ab8-96d6-6e1a03cfbd9e                   20151206         LON, IAD, AA, , 1,2

The last column is still a list. There are many ways you could handle this. One possibility is to create a row per subrequest, where the contents of all the other columns (X_id, downloadCount, etc.) are repeated. (This is almost the form that you give in your question, with the only difference that you left cells empty in the repeated columns, while I repeat the contents.) This is how it can be done:

table <- flatten(fromJSON(json))
tab_list <- lapply(1:nrow(table),
                  function(i) data.frame(table[i, -12], table[i, 12],
                              stringsAsFactors = FALSE))
library(dplyr)
flat_table <- bind_rows(tab_list)

The second line creates a list of data frames. These are combined into a single data frame using bind_rows() from dplyr. (To be more precise, flat_table will be a tbl_df, but the difference from a data.frame is small.) This can then be written to a csv file in the usual way:

write.csv(flat_table, file = "mydata.csv")
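
As an alternative to the lapply()/bind_rows() step, the list column can also be expanded with unnest() from the tidyr package. This is just a sketch, assuming tidyr version 1.0 or later is installed:

library(tidyr)

# expand the subRequests list column: one row per subrequest, with
# all other columns repeated for each of its rows
flat_table2 <- unnest(flatten(fromJSON(json)),
                      cols = c(uiSearchRequest.subRequests))

write.csv(flat_table2, file = "mydata.csv", row.names = FALSE)

flat_table2 should contain the same data as flat_table above, though some column names may differ slightly.
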
Stibu
  • That worked fine @Stibu, thanks, but it's only part of the solution! I'm having a problem exporting the contents of the table to a .csv or .xlsx file. Do you know how I should fix this error? Error in .jcall(cell, "V", "setCellValue", value) : method setCellValue with signature ([Ljava/lang/String;)V not found In addition: Warning message: In if (is.na(value)) { : the condition has length > 1 and only the first element will be used – Lester Pereira Jun 28 '16 at 20:20
  • The result of `flatten(table)` can indeed not be written to a csv file with `write.csv()`, because that function can not cope with the fact that one of the columns of the data frame is a list. I have added a few more lines of code that make sure that you really get a single data frame that can be written to a csv file. – Stibu Jun 28 '16 at 21:04
  • I'm not familiar with the error message in your comment. Since it mentions Java, I assume that you are using some package to create Excel files, so you probably have an issue with how you use that package. Maybe the problem is also the list contained in the data frame, and it will go away once you use the additional code that I provided. If not, you should ask a new and specific question about your problem with creating Excel files (after you have searched SO to check whether the question has already been asked, of course...) – Stibu Jun 28 '16 at 21:06
  • When I run tab_list <- lapply(1:nrow(table), function(i) data.frame(table[i, -12], table[i, 12], stringsAsFactors = FALSE)) I'm getting the following error: Error in data.frame(table[i, -12], table[i, 12], stringsAsFactors = FALSE) : arguments imply differing number of rows: 1, 0 – Lester Pereira Jun 28 '16 at 21:21
  • I get this error message if I define `table` as `table <- fromJSON(json)`. But it should be `table <- flatten(fromJSON(json))`? – Stibu Jun 30 '16 at 08:06
  • table <- fromJSON(json) works for me. I've got a solution for the same. I will edit your solution and accept it as the answer. Thanks. – Lester Pereira Jun 30 '16 at 15:21
  • The reason for my error was that my end-result data frame contains nested data frames for the nested subRequests portion of the JSON file. So even the base functions write.table() and write.csv(), as well as the xlsx package's write.xlsx(), will fail when writing to flat formats. I handled the flattening by binding the subRequests data frames and merging them back onto the larger final1 data frame using the row's id variable. Ultimately, I obtained a data frame of 13 observations (not the 9 elements from the JSON, because one of them contained 5 nested subRequests: id = 71a09900-1c13) – Lester Pereira Jun 30 '16 at 15:33
  • Well, apparently `table <- fromJSON(json)` did not work, because you got an error message that you wouldn't have had with `table <- flatten(fromJSON(json))`. The code in my answer gives the same result as your edit (apart from the column names and the order of the rows), but is much simpler. – Stibu Jun 30 '16 at 16:47

It's very simple in Python:

import pandas as pd
data = pd.read_json(path_to_input_file)
data.to_csv(path_to_csv_output_file)
Pramod Shinde
  • The question asks for a solution in R, not Python. – tospig Dec 05 '19 at 08:58
  • Just now I was working on converting JSON to CSV using R. However, I was still struggling with the above suggestion and found a cool solution using Pandas, so I thought I'd share it here. :) – Pramod Shinde Dec 05 '19 at 09:13