1

I am using spark 3.1 and trying to read a JSON file

I have defined the schema for below file as:

StructType([
        StructField('search_metadata', MapType(StringType(),StringType())),
        StructField('search_parameters', MapType(StringType(),StringType())),
        StructField('search_information', MapType(StringType(),StringType())),
        StructField('local_results',StructType([
                                    StructField('position', StringType(), True),
                                    StructField('title', StringType(), True),
                                    StructField('place_id', StringType(), True),
                                    StructField('data_id', StringType(), True),
                                    StructField('data_cid', StringType(), True),
                                    StructField('reviews_link', StringType(), True),
                                    StructField('photos_link', StringType(), True),
                                    StructField('gps_coordinates', MapType(StringType(),StringType()), True),
                                    StructField('place_id_search', StringType(), True),
                                    StructField('unclaimed_listing', BinaryType(), True),
                                    StructField('type', StringType(), True),
                                    StructField('address', StringType(), True),
                                    StructField('open_state', StringType(), True),
                                    StructField('hours', StringType(), True),
                                    StructField('phone', MapType(StringType(),StringType()), True),
                                    StructField('thumbnail', StringType(), True),
                                        ]), True),
        StructField('serpapi_pagination',MapType(StringType(),StringType())),
        StructField('search_query', StringType(), True),
        ]) 

And my JSON file is:

[{
    "search_metadata": {
        "id": "63560cab66440a949ade5d72",
        "status": "Success",
        "json_endpoint": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.json",
        "created_at": "2022-10-24 03:55:23 UTC",
        "processed_at": "2022-10-24 03:55:23 UTC",
        "google_maps_url": "https://www.google.com/maps/search/WH?hl=en",
        "raw_html_file": "https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.html",
        "total_time_taken": 1.91
    },
    "search_parameters": {
        "engine": "google_maps",
        "type": "search",
        "q": "WH",
        "google_domain": "google.com",
        "hl": "en"
    },
    "search_information": {
        "local_results_state": "Results for exact spelling",
        "query_displayed": "WH"
    },
    "local_results": [{
            "position": 1,
            "title": "WH International Casting, LLC",
            "place_id": "ChIJh0wvXcu_a4gRWuH-O1ltlPg",
            "data_id": "0x886bbfcb5d2f4c87:0xf8946d593bfee15a",
            "data_cid": "17912061847985381722",
            "reviews_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_reviews&hl=en",
            "photos_link": "https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_photos&hl=en",
            "gps_coordinates": {
                "latitude": 38.295865,
                "longitude": -85.73001099999999
            },
            "place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a%218m2%213d38.295865%214d-85.73001099999999&engine=google_maps&google_domain=google.com&hl=en&type=place",
            "unclaimed_listing": true,
            "type": "Warehouse",
            "address": "260 America Pl Dr, Jeffersonville, IN 47130",
            "open_state": "Closed ⋅ Opens 8AM Mon",
            "hours": "Closed ⋅ Opens 8AM Mon",
            "operating_hours": {
                "sunday": "Closed",
                "monday": "8AM–4:30PM",
                "tuesday": "8AM–4:30PM",
                "wednesday": "8AM–4:30PM",
                "thursday": "8AM–4:30PM",
                "friday": "8AM–4:30PM",
                "saturday": "Closed"
            },
            "phone": "(812) 725-8029",
            "thumbnail": "https://lh5.googleusercontent.com/p/AF1QipPWDyyzxp1MG27vv3WVZbzy5WVI-Qh2u2jEDb-C=w122-h92-k-no"
        },
        {
            "position": 2,
            "title": "W.H. Smith Manor",
            "place_id": "ChIJ9584e22DXIgR5w2f2saKBOU",
            "data_id": "0x885c836d7b389ff7:0xe5048ac6da9f0de7",
            "data_cid": "16502467521268354535",
            "reviews_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_reviews&hl=en",
            "photos_link": "https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_photos&hl=en",
            "gps_coordinates": {
                "latitude": 36.581589799999996,
                "longitude": -83.6581731
            },
            "place_id_search": "https://serpapi.com/search.json?data=%214m5%213m4%211s0x885c836d7b389ff7%3A0xe5048ac6da9f0de7%218m2%213d36.581589799999996%214d-83.6581731&engine=google_maps&google_domain=google.com&hl=en&type=place",
            "unclaimed_listing": true,
            "type": "University department",
            "address": "184 Robertson Ave, Harrogate, TN 37752",
            "open_state": "Closed ⋅ Opens 8AM Mon",
            "hours": "Closed ⋅ Opens 8AM Mon",
            "operating_hours": {
                "sunday": "Closed",
                "monday": "8AM–4:30PM",
                "tuesday": "8AM–4:30PM",
                "wednesday": "8AM–4:30PM",
                "thursday": "8AM–4:30PM",
                "friday": "8AM–4:30PM",
                "saturday": "Closed"
            },
            "phone": "(423) 869-3611",
            "website": "http://lmunet.edu/",
            "thumbnail": "https://streetviewpixels-pa.googleapis.com/v1/thumbnail?panoid=mJwpOER-2yIbmD3xSwQ2pQ&cb_client=search.gws-prod.gps&w=80&h=92&yaw=307.97266&pitch=0&thumbfov=100"
        }

    ],
    "serpapi_pagination": {
        "next": "https://serpapi.com/search.json?engine=google_maps&google_domain=google.com&hl=en&q=WH&start=20&type=search"
    },
    "search_query": "WH.json"
}]

I am trying to select some rows as below, but I am getting null values, I am assuming there is an issue with my schema definition.

df = df.select(col('local_results'),
               col('local_results.position').alias('position'), 
               col('local_results.title').alias('title'))
df.show()

Can someone help me to correct the schema and show the results?

ZygD
  • 22,092
  • 39
  • 79
  • 102
Xi12
  • 939
  • 2
  • 14
  • 27

1 Answers1

0

You get nulls, because your schema definition is not correct.

To find out correct schema, you can try reading the file without schema like this:

df = spark.read.option('multiline', 'true').json('file.json')

Then you will be able to modify the schema in places which can be modified.

Using the above schema, the select will works:

from pyspark.sql import functions as F

df = df.select(F.col('local_results'),
               F.col('local_results.position').alias('position'), 
               F.col('local_results.title').alias('title'))
df.show()
# +--------------------+--------+--------------------+
# |       local_results|position|               title|
# +--------------------+--------+--------------------+
# |[{260 America Pl ...|  [1, 2]|[WH International...|
# +--------------------+--------+--------------------+

Schemas

Using spark.read.option('multiline', 'true').json('file.json'), this schema is created:

StructType([
    StructField('local_results', ArrayType(StructType([
        StructField('address', StringType(), True), 
        StructField('data_cid', StringType(), True), 
        StructField('data_id', StringType(), True), 
        StructField('gps_coordinates', StructType([
            StructField('latitude', DoubleType(), True), 
            StructField('longitude', DoubleType(), True)
        ]), True), 
        StructField('hours', StringType(), True), 
        StructField('open_state', StringType(), True), 
        StructField('operating_hours', StructType([
            StructField('friday', StringType(), True), 
            StructField('monday', StringType(), True), 
            StructField('saturday', StringType(), True), 
            StructField('sunday', StringType(), True), 
            StructField('thursday', StringType(), True), 
            StructField('tuesday', StringType(), True), 
            StructField('wednesday', StringType(), True)
        ]), True), 
        StructField('phone', StringType(), True), 
        StructField('photos_link', StringType(), True), 
        StructField('place_id', StringType(), True), 
        StructField('place_id_search', StringType(), True), 
        StructField('position', LongType(), True), 
        StructField('reviews_link', StringType(), True),
        StructField('thumbnail', StringType(), True), 
        StructField('title', StringType(), True), 
        StructField('type', StringType(), True), 
        StructField('unclaimed_listing', BooleanType(), True), 
        StructField('website', StringType(), True)
    ]), True), True), 
    StructField('search_information', StructType([
        StructField('local_results_state', StringType(), True), 
        StructField('query_displayed', StringType(), True)
    ]), True), 
    StructField('search_metadata', StructType([
        StructField('created_at', StringType(), True), 
        StructField('google_maps_url', StringType(), True), 
        StructField('id', StringType(), True), 
        StructField('json_endpoint', StringType(), True), 
        StructField('processed_at', StringType(), True), 
        StructField('raw_html_file', StringType(), True), 
        StructField('status', StringType(), True), 
        StructField('total_time_taken', DoubleType(), True)
    ]), True), 
    StructField('search_parameters', StructType([
        StructField('engine', StringType(), True), 
        StructField('google_domain', StringType(), True), 
        StructField('hl', StringType(), True), 
        StructField('q', StringType(), True), 
        StructField('type', StringType(), True)
    ]), True), 
    StructField('search_query', StringType(), True), 
    StructField('serpapi_pagination', StructType([
        StructField('next', StringType(), True)
    ]), True)
])

You can extract it using df.schema.

It is different from yours, because you also have some map type columns in your schema. If you need map columns, you can do it.

The following schema would also work:

schema = StructType([
    StructField('local_results', ArrayType(StructType([
        StructField('address', StringType(), True), 
        StructField('data_cid', StringType(), True), 
        StructField('data_id', StringType(), True), 
        StructField('gps_coordinates', MapType(StringType(), StringType()), True), 
        StructField('hours', StringType(), True), 
        StructField('open_state', StringType(), True), 
        StructField('operating_hours', MapType(StringType(), StringType()), True), 
        StructField('phone', StringType(), True), 
        StructField('photos_link', StringType(), True), 
        StructField('place_id', StringType(), True), 
        StructField('place_id_search', StringType(), True), 
        StructField('position', LongType(), True), 
        StructField('reviews_link', StringType(), True),
        StructField('thumbnail', StringType(), True), 
        StructField('title', StringType(), True), 
        StructField('type', StringType(), True), 
        StructField('unclaimed_listing', BooleanType(), True), 
        StructField('website', StringType(), True)
    ]), True), True), 
    StructField('search_information', MapType(StringType(), StringType()), True), 
    StructField('search_metadata', MapType(StringType(), StringType()), True), 
    StructField('search_parameters', MapType(StringType(), StringType()), True), 
    StructField('search_query', StringType(), True), 
    StructField('serpapi_pagination', MapType(StringType(), StringType()), True)
])
df = spark.read.option('multiline', 'true').json('file.json', schema)

For full flattening even the following would work:

schema = StructType([
    StructField('local_results', ArrayType(MapType(StringType(), StringType()), True), True), 
    StructField('search_information', MapType(StringType(), StringType()), True), 
    StructField('search_metadata', MapType(StringType(), StringType()), True), 
    StructField('search_parameters', MapType(StringType(), StringType()), True), 
    StructField('search_query', StringType(), True), 
    StructField('serpapi_pagination', MapType(StringType(), StringType()), True)
])
df = spark.read.option('multiline', 'true').json('file.json', schema)

Result:

df.show(truncate=0)
# +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------+------------+---------------------------------------------------------------------------------------------------------------------+
# |local_results                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |search_information                                                        |search_metadata                                                                                                                                                                                                                                                                                                                                                                                                          |search_parameters                                                                      |search_query|serpapi_pagination                                                                                                   |
# +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------+------------+---------------------------------------------------------------------------------------------------------------------+
# |[{position -> 1, title -> WH International Casting, LLC, place_id -> ChIJh0wvXcu_a4gRWuH-O1ltlPg, data_id -> 0x886bbfcb5d2f4c87:0xf8946d593bfee15a, data_cid -> 17912061847985381722, reviews_link -> https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_reviews&hl=en, photos_link -> https://serpapi.com/search.json?data_id=0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a&engine=google_maps_photos&hl=en, gps_coordinates -> {"latitude":38.295865,"longitude":-85.73001099999999}, place_id_search -> https://serpapi.com/search.json?data=%214m5%213m4%211s0x886bbfcb5d2f4c87%3A0xf8946d593bfee15a%218m2%213d38.295865%214d-85.73001099999999&engine=google_maps&google_domain=google.com&hl=en&type=place, unclaimed_listing -> true, type -> Warehouse, address -> 260 America Pl Dr, Jeffersonville, IN 47130, open_state -> Closed ⋅ Opens 8AM Mon, hours -> Closed ⋅ Opens 8AM Mon, operating_hours -> {"sunday":"Closed","monday":"8AM–4:30PM","tuesday":"8AM–4:30PM","wednesday":"8AM–4:30PM","thursday":"8AM–4:30PM","friday":"8AM–4:30PM","saturday":"Closed"}, phone -> (812) 725-8029, thumbnail -> https://lh5.googleusercontent.com/p/AF1QipPWDyyzxp1MG27vv3WVZbzy5WVI-Qh2u2jEDb-C=w122-h92-k-no}, {position -> 2, title -> W.H. Smith Manor, place_id -> ChIJ9584e22DXIgR5w2f2saKBOU, data_id -> 0x885c836d7b389ff7:0xe5048ac6da9f0de7, data_cid -> 16502467521268354535, reviews_link -> https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_reviews&hl=en, photos_link -> https://serpapi.com/search.json?data_id=0x885c836d7b389ff7%3A0xe5048ac6da9f0de7&engine=google_maps_photos&hl=en, gps_coordinates -> {"latitude":36.581589799999996,"longitude":-83.6581731}, place_id_search -> https://serpapi.com/search.json?data=%214m5%213m4%211s0x885c836d7b389ff7%3A0xe5048ac6da9f0de7%218m2%213d36.581589799999996%214d-83.6581731&engine=google_maps&google_domain=google.com&hl=en&type=place, unclaimed_listing -> true, type -> University department, address -> 184 Robertson Ave, Harrogate, TN 37752, open_state -> Closed ⋅ Opens 8AM Mon, hours -> Closed ⋅ Opens 8AM Mon, operating_hours -> {"sunday":"Closed","monday":"8AM–4:30PM","tuesday":"8AM–4:30PM","wednesday":"8AM–4:30PM","thursday":"8AM–4:30PM","friday":"8AM–4:30PM","saturday":"Closed"}, phone -> (423) 869-3611, website -> http://lmunet.edu/, thumbnail -> https://streetviewpixels-pa.googleapis.com/v1/thumbnail?panoid=mJwpOER-2yIbmD3xSwQ2pQ&cb_client=search.gws-prod.gps&w=80&h=92&yaw=307.97266&pitch=0&thumbfov=100}]|{local_results_state -> Results for exact spelling, query_displayed -> WH}|{id -> 63560cab66440a949ade5d72, status -> Success, json_endpoint -> https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.json, created_at -> 2022-10-24 03:55:23 UTC, processed_at -> 2022-10-24 03:55:23 UTC, google_maps_url -> https://www.google.com/maps/search/WH?hl=en, raw_html_file -> https://serpapi.com/searches/b6986ff9ff715b13/63560cab66440a949ade5d72.html, total_time_taken -> 1.91}|{engine -> google_maps, type -> search, q -> WH, google_domain -> google.com, hl -> en}|WH.json     |{next -> https://serpapi.com/search.json?engine=google_maps&google_domain=google.com&hl=en&q=WH&start=20&type=search}|
# +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------+------------+---------------------------------------------------------------------------------------------------------------------+

But since all map values are forced to be strings, the inner objects cannot become maps. E.g. gps_coordinates result after the full flattening would look like this:

gps_coordinates -> {"latitude":38.295865,"longitude":-85.73001099999999}
ZygD
  • 22,092
  • 39
  • 79
  • 102
  • Thank you. What about this answer? Has it helped to solve your issue? – ZygD Nov 03 '22 at 14:08
  • My aim is to flatten out entire local_results, but currently, when I follow your schmea, I am only getting value([{260 America Pl .......) . but not the key associated. I don't know which column it belongs to.. How can I modify this? – Xi12 Nov 04 '22 at 01:28
  • @Xi12 - I have updated the answer with full flattening and its considerations. – ZygD Nov 04 '22 at 13:58
  • Thank you, a last question but how would you access gps_coordinates? – Xi12 Nov 04 '22 at 18:34
  • 1
    I would suggest exploding the column 'local_results', because its's of type *array of maps*. Exploded result should be saved as column before accessing it. Example: `df.withColumn('local_results', F.explode('local_results')).select(F.col('local_results')['gps_coordinates']).show()`. Otherwise, you would need to specify the index number of array element that you need, which may lead to errors, and it only selects one element: `df.select(F.col('local_results')[0]['gps_coordinates']).show()` – ZygD Nov 05 '22 at 05:48