15

We have loaded json blobs in a String field in a Bigquery table. I need to create a view (using standard sql)over the table that would extract the array field as a bigquery array/repeated field of "RECORD" type (which itself includes a repeated field).

Here is a sample record (json_blob):

{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}

I am hoping to end up with a view that has the following layout:

[
{
    "name": "order_id",
    "type": "STRING",
    "mode": "NULLABLE"
},
{
    "mode": "NULLABLE",
    "name": "customer_id",
    "type": "STRING"
},
{
    "mode": "REPEATED",
    "name": "items",
    "type": "RECORD",
    "fields": [
        {
            "mode": "NULLABLE",
            "name": "line",
            "type": "STRING"
        },
        {
            "mode": "REPEATED",
            "name": "ref_ids",
            "type": "STRING"
        },
        {
            "mode": "NULLABLE",
            "name": "sku",
            "type": "STRING"
        },
        {
            "mode": "NULLABLE",
            "name": "amount",
            "type": "INTEGER"
        }
    ]
}
]

Json_extract(json_blob, '$.items') extracts the items parts, but how do I convert that to a bigquery array of type "RECORD" which then can be processed like normal bigquery array/repeated of STRUCT?

Appreciate any help.

FZF
  • 855
  • 4
  • 12
  • 29

3 Answers3

14

There is no way to do this using SQL functions in BigQuery at the time of this writing unless you can impose a hard limit on the number of values in the JSON array; see the relevant issue tracker item. Your options are:

  • Process the data differently (e.g. using Cloud Dataflow or another tool) so that you can load it from newline-delimited JSON into BigQuery.
  • Use a JavaScript UDF that takes the input JSON and returns the desired type; this is fairly straightforward but generally uses more CPU (and hence may require a higher billing tier).
  • Use SQL functions with the understanding that the solution breaks down if there are too many elements.

Here is the approach using a JavaScript UDF:

#standardSQL
CREATE TEMP FUNCTION JsonToItems(input STRING)
RETURNS STRUCT<order_id INT64, customer_id STRING, items ARRAY<STRUCT<line STRING, ref_ids ARRAY<STRING>, sku STRING, amount INT64>>>
LANGUAGE js AS """
return JSON.parse(input);
""";

WITH Input AS (
  SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json
)
SELECT
  JsonToItems(json).*
FROM Input;

If you do want to try the SQL-based approach without JavaScript, here's somewhat of a hack until the feature request above is resolved, where the number of array elements must be no more than 10:

#standardSQL
CREATE TEMP FUNCTION JsonExtractRefIds(json STRING) AS (
  (SELECT ARRAY_AGG(v IGNORE NULLS)
   FROM UNNEST([
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[0]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[1]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[2]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[3]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[4]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[5]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[6]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[7]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[8]'),
     JSON_EXTRACT_SCALAR(json, '$.ref_ids[9]')]) AS v)
);

CREATE TEMP FUNCTION JsonToItem(json STRING)
RETURNS STRUCT<line STRING, ref_ids ARRAY<STRING>, sku STRING, amount INT64>
AS (
  IF(json IS NULL, NULL,
    STRUCT(
      JSON_EXTRACT_SCALAR(json, '$.line'),
      JsonExtractRefIds(json),
      JSON_EXTRACT_SCALAR(json, '$.sku'),
      CAST(JSON_EXTRACT_SCALAR(json, '$.amount') AS INT64)
    )
  )
);

CREATE TEMP FUNCTION JsonToItems(json STRING) AS (
  (SELECT AS STRUCT
    CAST(JSON_EXTRACT_SCALAR(json, '$.order_id') AS INT64) AS order_id,
    JSON_EXTRACT_SCALAR(json, '$.customer_id') AS customer_id,
    (SELECT ARRAY_AGG(v IGNORE NULLS)
     FROM UNNEST([
       JsonToItem(JSON_EXTRACT(json, '$.items[0]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[1]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[2]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[3]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[4]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[5]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[6]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[7]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[8]')),
       JsonToItem(JSON_EXTRACT(json, '$.items[9]'))]) AS v) AS items
  )
);

WITH Input AS (
  SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json
)
SELECT
  JsonToItems(json).*
FROM Input;
Elliott Brossard
  • 32,095
  • 2
  • 67
  • 99
  • Very cool answer (and hack)! Just got curious to ask about the array length having to be no bigger then 10. Is it due memory consumption? I tested here with jsons having more than 10 and it worked normally so I assume this is something related to memory maybe. – Willian Fuks Jul 26 '17 at 07:09
  • It's because I hardcoded the paths used to extract the array elements (the path is required to be constant). You'll only get 10 items even if there are 11 in that array, for instance. – Elliott Brossard Jul 26 '17 at 13:42
14

A little bit more brute-force version - I think easier to read and modify/adjust if needed

#standardSQL
WITH `yourTable` AS (
  SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob
)
SELECT 
   JSON_EXTRACT_SCALAR(json_blob, '$.order_id') AS order_id,
   JSON_EXTRACT_SCALAR(json_blob, '$.customer_id') AS customer_id,
   ARRAY(
    SELECT STRUCT(
        JSON_EXTRACT_SCALAR(split_items, '$.line') AS line,
        SPLIT(REGEXP_REPLACE(JSON_EXTRACT (split_items, '$.ref_ids'), r'[\[\]\"]', '')) AS ref_ids,
        JSON_EXTRACT_SCALAR(split_items, '$.sku') AS sku,
        JSON_EXTRACT_SCALAR(split_items, '$.amount') AS amount
      )
    FROM (
      SELECT CONCAT('{', REGEXP_REPLACE(split_items, r'^\[{|}\]$', ''), '}') AS split_items
      FROM UNNEST(SPLIT(JSON_EXTRACT(json_blob, '$.items'), '},{')) AS split_items
    )
   ) AS items
FROM `yourTable` 
Mikhail Berlyant
  • 165,386
  • 8
  • 154
  • 230
8

As of 1st May 2020, JSON_EXTRACT_ARRAY function has been added, and can be used to retrieve array from json.

#standardSQL
WITH `yourTable` AS (
  SELECT '{"order_id":"123456","customer_id":"2abcd", "items":[{"line":"1","ref_ids":["66b56e60","9e7ca2b7"],"sku":"1111","amount":40 },{"line":"2","ref_ids":["7777h0","8888j0"],"sku":"2222","amount":10 }]}' AS json_blob 
)
SELECT
  json_extract_scalar(json_blob,'$.order_id') AS order_id,
  json_extract_scalar(json_blob,'$.customer_id') AS customer_id,
  ARRAY(
  SELECT
    STRUCT(json_extract_scalar(split_items,'$.line') AS line,
          ARRAY(SELECT json_extract_scalar(ref_element,'$') FROM UNNEST(json_extract_array(split_items, '$.ref_ids')) ref_element) AS ref_ids,
          json_extract_scalar(split_items,'$.sku') AS sku,
          json_extract_scalar(split_items,'$.amount') AS amount 
      )
    FROM UNNEST(json_extract_array(json_blob,'$.items')) split_items 
  ) AS items
FROM
  `yourTable`

Returns:

enter image description here

To get only the type query would be:

#standardSQL
WITH `yourTable` AS (
  SELECT '{ "firstName": "John", "lastName" : "doe", "age"      : 26, "address"  : {     "streetAddress": "naist street",     "city"         : "Nara",     "postalCode"   : "630-0192" }, "phoneNumbers": [     {       "type"  : "iPhone",       "number": "0123-4567-8888"     },     {       "type"  : "home",       "number": "0123-4567-8910"     } ]}' AS json_blob 
)
  SELECT
    json_extract_scalar(split_items,'$.type') AS type FROM `yourTable`, UNNEST(json_extract_array(json_blob,'$.phoneNumbers')) split_items

returns:

enter image description here

Pentium10
  • 204,586
  • 122
  • 423
  • 502