5

Consider we've got a bunch of subtress that look like this:

subtree1 = {
    "id": "root",
    "children": [
        {
            "id": "file",
            "caption": "File",
            "children": []
        },
        {
            "id": "edit",
            "caption": "Edit",
            "children": []
        },
        {
            "id": "tools",
            "caption": "Tools",
            "children": [
                {
                    "id": "packages",
                    "caption": "Packages",
                    "children": []
                }
            ]
        },
        {
            "id": "help",
            "caption": "Help",
            "children": []
        },
    ]
}

subtree2 = {
    "id": "root",
    "children": [
        {
            "id": "file",
            "caption": "File",
            "children": [
                {"caption": "New"},
                {"caption": "Exit"},
            ]        
        }
    ]
}

subtree3 = {
    "id": "root",
    "children": [
        {
            "id": "edit",
            "children": [
                {"caption": "Copy"},
                {"caption": "Cut"},
                {"caption": "Paste"},
            ]
        },
        {
            "id": "help",
            "children": [
                {"caption": "About"},
            ]
        }
    ]
}

subtree4 = {
    "id": "root",
    "children": [
        {
            "id": "edit",
            "children": [
                {
                    "id": "text",
                    "caption": "Text",
                    "children": [
                        { "caption": "Insert line before" },
                        { "caption": "Insert line after" }
                    ]
                }
            ]
        }
    ]
}

I'm trying to figure how to code a merge function such as doing something like this:

tree0 = merge(subtree1, subtree2)
tree0 = merge(tree0, subtree3)
tree0 = merge(tree0, subtree4)

will produce:

tree0 = {
    "id": "root",
    "children": [
        {
            "id": "file",
            "caption": "File",
            "children": [
                {"caption": "New"},
                {"caption": "Exit"},
            ]   
        },
        {
            "id": "edit",
            "caption": "Edit",
            "children": [
                {"caption": "Copy"},
                {"caption": "Cut"},
                {"caption": "Paste"},
                {
                    "id": "text",
                    "caption": "Text",
                    "children": [
                        { "caption": "Insert line before" },
                        { "caption": "Insert line after" }
                    ]
                }
            ]
        },
        {
            "id": "tools",
            "caption": "Tools",
            "children": [
                {
                    "id": "packages",
                    "caption": "Packages",
                    "children": []
                }
            ]
        },
        {
            "id": "help",
            "caption": "Help",
            "children": [
                {"caption": "About"},
            ]
        },
    ]
}

but doing something like this:

tree1 = merge(subtree1, subtree2)
tree1 = merge(tree1, subtree4)
tree1 = merge(tree1, subtree3)

would produce:

tree1 = {
    "id": "root",
    "children": [
        {
            "id": "file",
            "caption": "File",
            "children": [
                {"caption": "New"},
                {"caption": "Exit"},
            ]   
        },
        {
            "id": "edit",
            "caption": "Edit",
            "children": [
                {
                    "id": "text",
                    "caption": "Text",
                    "children": [
                        { "caption": "Insert line before" },
                        { "caption": "Insert line after" }
                    ]
                },
                {"caption": "Copy"},
                {"caption": "Cut"},
                {"caption": "Paste"},
            ]
        },
        {
            "id": "tools",
            "caption": "Tools",
            "children": [
                {
                    "id": "packages",
                    "caption": "Packages",
                    "children": []
                }
            ]
        },
        {
            "id": "help",
            "caption": "Help",
            "children": [
                {"caption": "About"},
            ]
        },
    ]
}

Said otherwise, loading the subtrees in the same order will always produce the same tree but if you use the same list of subtrees in different order you're not guaranteed to produce the same tree (as children lists could be extended in different order).

I've already attempted to code this but I don't know how what's the merge algorithm behaviour and that's my question. Could anyone provide the code/pseudocode/explanation so I can implement it?

PS: Below you'll find some random attempt I thought could lead me to victory

if __name__ == '__main__':
    from collections import defaultdict

    subtree1 = {
        "id": "root",
        "children": [
            {
                "id": "file",
                "caption": "File",
                "children": []
            },
            {
                "id": "edit",
                "caption": "Edit",
                "children": []
            },
            {
                "id": "tools",
                "caption": "Tools",
                "children": [
                    {
                        "id": "packages",
                        "caption": "Packages",
                        "children": []
                    }
                ]
            },
            {
                "id": "help",
                "caption": "Help",
                "children": []
            },
        ]
    }

    subtree2 = {
        "id": "root",
        "children": [
            {
                "id": "file",
                "caption": "File",
                "children": [
                    {"caption": "New"},
                    {"caption": "Exit"},
                ]
            }
        ]
    }

    subtree3 = {
        "id": "root",
        "children": [
            {
                "id": "edit",
                "children": [
                    {"caption": "Copy"},
                    {"caption": "Cut"},
                    {"caption": "Paste"},
                ]
            },
            {
                "id": "help",
                "children": [
                    {"caption": "About"},
                ]
            }
        ]
    }

    subtree4 = {
        "id": "root",
        "children": [
            {
                "id": "edit",
                "children": [
                    {
                        "id": "text",
                        "caption": "Text",
                        "children": [
                            {"caption": "Insert line before"},
                            {"caption": "Insert line after"}
                        ]
                    }
                ]
            }
        ]
    }

    lst = [
        subtree1,
        subtree2,
        subtree3,
        subtree4
    ]

    def traverse(node, path=[]):
        yield node, tuple(path)

        for c in node.get("children", []):
            path.append(c.get("id", None))
            yield from traverse(c)
            path.pop()

    # Levels & Hooks
    dct_levels = defaultdict(list)
    dct_hooks = defaultdict(list)
    for subtree in lst:
        for n, p in traverse(subtree):
            if p not in dct_levels[len(p)]:
                dct_levels[len(p)].append(p)
            dct_hooks[p].append(n)

    print(dct_levels)
    print(dct_hooks[("file",)])

    # Merge should happen here
    tree = {
        "id": "root",
        "children": []
    }

    for level in range(1, max(dct_levels.keys()) + 1):
        print("populating level", level, dct_levels[level])

but not sure if I'm creating the right structures/helpers here as it's still unclear how the whole algorithm works... that's what this question is all about

BPL
  • 9,632
  • 9
  • 59
  • 117

3 Answers3

7

Tested with your examples on Python 3.5.

from copy import deepcopy


def merge(x: dict, y: dict) -> dict:
    'Merge subtrees x y, and return the results as a new tree.'
    return merge_inplace(deepcopy(x), y)


def merge_inplace(dest: dict, src: dict) -> dict:
    'Merge subtree src into dest, and return dest.'

    # perform sanity checks to make the code more rock solid
    # feel free to remove those lines if you don't need
    assert dest.get('id'), 'Cannot merge anonymous subtrees!'
    assert dest.get('id') == src.get('id'), 'Identity mismatch!'

    # merge attributes
    dest.update((k, v) for k, v in src.items() if k != 'children')

    # merge children
    if not src.get('children'):  # nothing to do, so just exit
        return dest
    elif not dest.get('children'):  # if the children list didn't exist
        dest['children'] = []  # then create an empty list for it

    named_dest_children = {
        child['id']: child
        for child in dest['children']
        if 'id' in child
    }
    for child in src['children']:
        if 'id' not in child:  # anonymous child, just append it
            dest['children'].append(child)
        elif child['id'] in named_dest_children:  # override a named subtree
            merge_inplace(named_dest_children[child['id']], child)
        else:  # create a new subtree
            dest['children'].append(child)
            named_dest_children[child['id']] = child
    return dest
Arnie97
  • 1,020
  • 7
  • 19
  • This answer is really good and I think it may already be used on real-case scenarios... There is a little case where I'm not sure what the output should be, check `test2` in this [code](https://dl.dropboxusercontent.com/s/wtdkzq4ak97diyp/2019-05-11_13-24-19.txt)... If I'm not mistaken on Sublime when an item has the same id it'll override new attributes, on that code the item `root/preferences` will end up having `"caption": "NewPreferences"` and `"commnad": "do_something"`. I'll check again on Sublime to be extra sure but i think that's what's gonna happen. That said, really nice answer, +1 – BPL May 11 '19 at 11:26
  • 2
    @BPL I've edited the answer to override the attributes - they could be copied in a oneliner simply if subtrees are not allowed in them – Arnie97 May 12 '19 at 16:56
1

You can use itertools.groupby with recursion:

from itertools import groupby
def merge(*args):
   if len(args) < 2 or any('id' not in i for i in args):
      return list(args)
   _d = [(a, list(b)) for a, b in groupby(sorted(args, key=lambda x:x['id']), key=lambda x:x['id'])]
   return [{**{j:k for h in b for j, k in h.items()}, 'id':a, 'children':merge(*[i for c in b for i in c['children']])} for a, b in _d]

Via args, this solution treats each passed dictionary as a member of a children list. This is to account for the possibility that two or more dictionaries may be passed to merge that have different ids i.e {'id':'root', 'children':[...]} and {'id':'root2', 'children':[...]}. As such, this solution would return a list of [{'id':'root', 'children':[...]}, {'id':'root2', 'children':[...]}], as the distinct ids do not provide an avenue for a match. As such, in the context of your current problem, you need to use indexing to access the single returned element of the resulting list: the merged dict with id 'root':

import json
tree0 = merge(subtree1, subtree2)[0]
tree0 = merge(tree0, subtree3)[0]
tree0 = merge(tree0, subtree4)[0]
print(json.dumps(tree0, indent=4))

Output:

{
  "id": "root",
  "children": [
    {
        "id": "edit",
        "caption": "Edit",
        "children": [
            {
                "caption": "Copy"
            },
            {
                "caption": "Cut"
            },
            {
                "caption": "Paste"
            },
            {
                "id": "text",
                "caption": "Text",
                "children": [
                    {
                        "caption": "Insert line before"
                    },
                    {
                        "caption": "Insert line after"
                    }
                ]
            }
        ]
    },
    {
        "id": "file",
        "caption": "File",
        "children": [
            {
                "caption": "New"
            },
            {
                "caption": "Exit"
            }
        ]
    },
    {
        "id": "help",
        "caption": "Help",
        "children": [
            {
                "caption": "About"
            }
        ]
    },
    {
        "id": "tools",
        "caption": "Tools",
        "children": [
            {
                "id": "packages",
                "caption": "Packages",
                "children": []
            }
        ]
      }
   ]
}
Ajax1234
  • 69,937
  • 8
  • 61
  • 102
  • Not everything should be golfed. 132 character line (without counting indentation) with a 5 for-loop list comprehension which has recursion and only single character variable names. This is just a horrible practise, which leads to unmaintainable code. – ruohola May 14 '19 at 16:22
  • Thanks for the answer, reason why I didn't validate/bountied this answer was because Arnie97 provided a nice solution first and community decided was a better answer... ruohola has a point although I love compact clever code and this one definitely is... so there you go, +1 ;) – BPL May 15 '19 at 07:14
0

Hand coding for merging JSON documents / objects may not be a optimum solution. DRY!
I've used genson, jsonschema and jsonmerge packages here for merging.

genson generates JSON Schema from JSON instance documents.
jsonschema validates JSON instance documents with JSON Schema.
jsonmerge merges objects / JSON documents by extending JSON Schema.

Let's first generate JSON Schema from JSON instances.

trees = (subtree1, subtree2, subtree3, subtree4)
schema_builder = genson.SchemaBuilder()
for tree in trees:
    schema_builder.add_object(tree)

schema = schema_builder.to_schema()

Now specify merge strategy.

schema['properties']['children']['mergeStrategy'] = 'arrayMergeById'
schema['properties']['children']['items']['properties']['children']['mergeStrategy'] = 'append'

arrayMergeById strategy merges objects by object's id property. append strategy collects objects in an array.
Here's full code;

import genson
import jsonmerge
import jsonschema

subtree1 = {
    "id":
    "root",
    "children": [
        {
            "id": "file",
            "caption": "File",
            "children": []
        },
        {
            "id": "edit",
            "caption": "Edit",
            "children": []
        },
        {
            "id": "tools",
            "caption": "Tools",
            "children": [{
                "id": "packages",
                "caption": "Packages",
                "children": []
            }]
        },
        {
            "id": "help",
            "caption": "Help",
            "children": []
        },
    ]
}

subtree2 = {
    "id":
    "root",
    "children": [{
        "id": "file",
        "caption": "File",
        "children": [
            {
                "caption": "New"
            },
            {
                "caption": "Exit"
            },
        ]
    }]
}

subtree3 = {
    "id":
    "root",
    "children": [{
        "id":
        "edit",
        "children": [
            {
                "caption": "Copy"
            },
            {
                "caption": "Cut"
            },
            {
                "caption": "Paste"
            },
        ]
    }, {
        "id": "help",
        "children": [
            {
                "caption": "About"
            },
        ]
    }]
}

subtree4 = {
    "id":
    "root",
    "children": [{
        "id":
        "edit",
        "children": [{
            "id":
            "text",
            "caption":
            "Text",
            "children": [{
                "caption": "Insert line before"
            }, {
                "caption": "Insert line after"
            }]
        }]
    }]
}

trees = (subtree1, subtree2, subtree3, subtree4)
schema_builder = genson.SchemaBuilder()
for tree in trees:
    schema_builder.add_object(tree)

schema = schema_builder.to_schema()
print("Validating schema...", end='')
for tree in trees:
    jsonschema.validate(tree, schema)
print(' done')
schema['properties']['children']['mergeStrategy'] = 'arrayMergeById'
schema['properties']['children']['items']['properties']['children']['mergeStrategy'] = 'append'

merger = jsonmerge.Merger(schema=schema)
tree = merger.merge(subtree1, subtree2)
tree = merger.merge(tree, subtree3)
tree = merger.merge(tree, subtree4)
print(tree)
Nizam Mohamed
  • 8,751
  • 24
  • 32