More an idea than a real solution. Let's assume your data looks like this:
data = [
{"foobar":
{"foo": 1, "bar": 2, "fozbaz": {
"foz": 0, "baz": {"b": -1, "a": -1, "z": -1}
}}}]
import json
with open("foobar.json", "w") as fw:
for record in data:
fw.write(json.dumps(record))
First lets load it and check schema:
>>> srdd = sqlContext.jsonFile("foobar.json")
>>> srdd.printSchema()
root
|-- foobar: struct (nullable = true)
| |-- bar: integer (nullable = true)
| |-- foo: integer (nullable = true)
| |-- fozbaz: struct (nullable = true)
| | |-- baz: struct (nullable = true)
| | | |-- a: integer (nullable = true)
| | | |-- b: integer (nullable = true)
| | | |-- z: integer (nullable = true)
| | |-- foz: integer (nullable = true)
Now we register table as suggested by Justin Pihony and extract schema:
srdd.registerTempTable("srdd")
schema = srdd.schema().jsonValue()
Instead of flattening data we can flatten schema using something similar to this:
def flatten_schema(schema):
"""Take schema as returned from schema().jsonValue()
and return list of field names with full path"""
def _flatten(schema, path="", accum=None):
# Extract name of the current element
name = schema.get("name")
# If there is a name extend path
if name is not None:
path = "{0}.{1}".format(path, name) if path else name
# It is some kind of struct
if isinstance(schema.get("fields"), list):
for field in schema.get("fields"):
_flatten(field, path, accum)
elif isinstance(schema.get("type"), dict):
_flatten(schema.get("type"), path, accum)
# It is an atomic type
else:
accum.append(path)
accum = []
_flatten(schema, "", accum)
return accum
add small helper to format query string:
def build_query(schema, df):
select = ", ".join(
"{0} AS {1}".format(field, field.replace(".", "_"))
for field in flatten_schema(schema))
return "SELECT {0} FROM {1}".format(select, df)
and finally results:
>>> sqlContext.sql(build_query(schema, "srdd")).printSchema()
root
|-- foobar_bar: integer (nullable = true)
|-- foobar_foo: integer (nullable = true)
|-- foobar_fozbaz_baz_a: integer (nullable = true)
|-- foobar_fozbaz_baz_b: integer (nullable = true)
|-- foobar_fozbaz_baz_z: integer (nullable = true)
|-- foobar_fozbaz_foz: integer (nullable = true)
Disclaimer: I didn't try to get very deep into schema structure so most likely there are some cases not covered by flatten_schema
.