
I'm taking my first steps with MongoDB, but I'm having difficulty writing complex queries.

I have several items like this in my collection:

{
    "_id" : "sku001",
    "deposits" : [ 
        {
            "deposit_id" : "deposit01",
            "total" : "3",
            "sizes" : [ 
                {
                    "size" : "36",
                    "stock" : "2"
                }, 
                {
                    "size" : "38",
                    "stock" : "0"
                }, 
                {
                    "size" : "40",
                    "stock" : "0"
                }, 
                {
                    "size" : "42",
                    "stock" : "0"
                }, 
                {
                    "size" : "44",
                    "stock" : "0"
                }, 
                {
                    "size" : "46",
                    "stock" : "1"
                }, 
                {
                    "size" : "48",
                    "stock" : "0"
                }, 
                {
                    "size" : "50",
                    "stock" : "0"
                }, 
                {
                    "size" : "52",
                    "stock" : "0"
                }
            ]
        }, 
        {
            "deposit_id" : "deposit02",
            "total" : "5",
            "sizes" : [ 
                {
                    "size" : "36",
                    "stock" : "1"
                }, 
                {
                    "size" : "38",
                    "stock" : "1"
                }, 
                {
                    "size" : "40",
                    "stock" : "0"
                }, 
                {
                    "size" : "42",
                    "stock" : "1"
                }, 
                {
                    "size" : "44",
                    "stock" : "0"
                }, 
                {
                    "size" : "46",
                    "stock" : "1"
                }, 
                {
                    "size" : "48",
                    "stock" : "1"
                }, 
                {
                    "size" : "50",
                    "stock" : "0"
                }, 
                {
                    "size" : "52",
                    "stock" : "0"
                }
            ]
        }, 
        {
            "deposit_id" : "deposit03",
            "total" : "2",
            "sizes" : [ 
                {
                    "size" : "36",
                    "stock" : "1"
                }, 
                {
                    "size" : "38",
                    "stock" : "0"
                }, 
                {
                    "size" : "40",
                    "stock" : "0"
                }, 
                {
                    "size" : "42",
                    "stock" : "1"
                }, 
                {
                    "size" : "44",
                    "stock" : "0"
                }, 
                {
                    "size" : "46",
                    "stock" : "0"
                }, 
                {
                    "size" : "48",
                    "stock" : "0"
                }, 
                {
                    "size" : "50",
                    "stock" : "0"
                }, 
                {
                    "size" : "52",
                    "stock" : "0"
                }
            ]
        }, 
        {
            "deposit_id" : "deposit04",
            "total" : "0",
            "sizes" : [ 
                {
                    "size" : "36",
                    "stock" : "0"
                }, 
                {
                    "size" : "38",
                    "stock" : "0"
                }, 
                {
                    "size" : "40",
                    "stock" : "0"
                }, 
                {
                    "size" : "42",
                    "stock" : "0"
                }, 
                {
                    "size" : "44",
                    "stock" : "0"
                }, 
                {
                    "size" : "46",
                    "stock" : "0"
                }, 
                {
                    "size" : "48",
                    "stock" : "0"
                }, 
                {
                    "size" : "50",
                    "stock" : "0"
                }, 
                {
                    "size" : "52",
                    "stock" : "0"
                }
            ]
        }
    ]
}

and I would like to write a query that outputs this resulting document:

{
    "_id" : "sku001",
    "total": 10,
    "sizes" [
        {
            "size" : "36",
            "stock" : "4"
        }, 
        {
            "size" : "38",
            "stock" : "1"
        }, 
        {
            "size" : "40",
            "stock" : "0"
        }, 
        {
            "size" : "42",
            "stock" : "2"
        }, 
        {
            "size" : "44",
            "stock" : "0"
        }, 
        {
            "size" : "46",
            "stock" : "2"
        }, 
        {
            "size" : "48",
            "stock" : "1"
        }, 
        {
            "size" : "50",
            "stock" : "0"
        }, 
        {
            "size" : "52",
            "stock" : "0"
        }
    ]
}

so that:

  • total is the sum of the "total" field of each deposit
  • sizes is an array in which the stock of each size is summed across all deposits

I know that I need to use aggregate() with $unwind, $group and maybe $project, but I don't understand how to combine them or in which order.

Can you help me? I'm using MongoDB version 3.4.3.

  • The first step you'd want to take is to parse the string values into numerical ones, i.e. change the schema; then the appropriate aggregation pipeline arithmetic operators can be effective. Applying them to the current schema won't give you the correct results. – chridam Mar 30 '17 at 13:12
  • I didn't notice that the stock counts were strings... sorry, my mistake. Anyway, I have rebuilt the schema, making sure the total and stock values are stored as integers, and everything worked flawlessly, even though I had to use `{allowDiskUse: true}` and the query took nearly 4 seconds to execute. I'm wondering what indexes I should use to optimize the aggregation query... hints? – comcas Mar 31 '17 at 08:35
  • The issue is the way the schema is designed: you have nested arrays, which may cause performance issues in the aggregation pipeline when the `$unwind` operator is applied to denormalise/flatten them. The reason `$unwind` is not as efficient is that it produces a cartesian product of the documents, i.e. a copy of each document per array entry, which uses more memory (pipeline stages are capped at 100 MB of RAM each) and therefore takes time both to produce and to process during the flattening. – chridam Mar 31 '17 at 08:49
  • Thanks for the explanation. So, in order to represent such data, what would be a better design for this schema, considering that I would like to obtain the same result? What should I keep in mind when designing a schema? – comcas Mar 31 '17 at 11:59
  • I would say these answers pretty much address your schema design questions: [MongoDB relationships: embed or reference?](http://stackoverflow.com/a/5373969) and [Update embedded document in Mongodb: Performance issue?](http://stackoverflow.com/a/40157629) – chridam Mar 31 '17 at 12:40

1 Answer


You may want to change your schema before applying the aggregation framework operators to produce your desired report, i.e. parse the string values to numbers first and then run the aggregate operation.

Changing the schema

To change the schema, iterate the collection and, for each document, update the keys whose string values need to be parsed to numbers. If it's not possible to change the original collection in place, you may want to write the converted documents to a temporary collection instead. To update in place, consider running the following operation (suitable for relatively small collections):

db.collection.find({}).snapshot()
  .forEach(function(doc) {
        var query = { "_id": doc._id },
            update = { "$set": {} };

        // Build a $set document that converts every "total" and "stock"
        // string in the nested arrays to an integer, addressing each
        // element by its positional dot path (e.g. "deposits.0.sizes.3.stock").
        for (var i = 0; i < doc.deposits.length; i++) {
            update["$set"]["deposits." + i + ".total"] = parseInt(doc.deposits[i].total, 10);
            for (var j = 0; j < doc.deposits[i].sizes.length; j++) {
                update["$set"]["deposits." + i + ".sizes." + j + ".stock"] = parseInt(doc.deposits[i].sizes[j].stock, 10);
            }
        }

        // Optional: inspect the generated update before applying it
        printjson(query);
        printjson(update);

        db.collection.updateOne(query, update);
  });
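
For larger collections, issuing one round trip per document as above can be slow. A possible variant (a sketch only, assuming the same collection and field names as above) is to queue the same $set updates and send them in batches with bulkWrite(), which is available from MongoDB 3.2:

var ops = [];

db.collection.find({}).snapshot().forEach(function(doc) {
    var update = { "$set": {} };

    // Same conversion as above: parse the string totals and stocks to integers
    for (var i = 0; i < doc.deposits.length; i++) {
        update["$set"]["deposits." + i + ".total"] = parseInt(doc.deposits[i].total, 10);
        for (var j = 0; j < doc.deposits[i].sizes.length; j++) {
            update["$set"]["deposits." + i + ".sizes." + j + ".stock"] = parseInt(doc.deposits[i].sizes[j].stock, 10);
        }
    }

    ops.push({ "updateOne": { "filter": { "_id": doc._id }, "update": update } });

    // Send the queued updates in batches of 500 to keep client memory in check
    if (ops.length === 500) {
        db.collection.bulkWrite(ops);
        ops = [];
    }
});

// Flush any remaining queued updates
if (ops.length > 0) {
    db.collection.bulkWrite(ops);
}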

Aggregate Pipeline

With the correct schema in place, you can run the following aggregate operation, which should give you the desired report:

db.collection.aggregate([
    // Flatten the nested arrays so there is one document per (deposit, size) pair
    { "$unwind": "$deposits" },
    { "$unwind": "$deposits.sizes" },
    // Sum the stock of each size across all deposits, keeping the per-deposit
    // totals around for the later stages
    {
        "$group": {
            "_id": {
                "id": "$_id",
                "size": "$deposits.sizes.size"
            },
            "total_stock": { "$sum": "$deposits.sizes.stock" },
            "deposits": {
                "$push": {
                    "deposit_id": "$deposits.deposit_id",
                    "total": "$deposits.total"
                }
            }
        }
    },
    // Regroup per deposit: collect the aggregated per-size stocks and take
    // each deposit's total exactly once
    { "$unwind": "$deposits" },
    {
        "$group": {
            "_id": {
                "id": "$_id.id",
                "deposit_id": "$deposits.deposit_id"
            },
            "sizes": {
                "$push": {
                    "size": "$_id.size",
                    "stock": "$total_stock"
                }
            },
            "total": { "$first": "$deposits.total" }
        }
    },
    // Final shape: one document per sku with the grand total and the
    // aggregated sizes array
    {
        "$group": {
            "_id": "$_id.id",
            "total": { "$sum": "$total" },
            "sizes": { "$first": "$sizes" }
        }
    }
])
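
As noted in the comments, the two $unwind stages can push the $group stages past the 100 MB per-stage memory limit on a larger collection; in that case the same pipeline can be run with disk spilling enabled, for example:

db.collection.aggregate([ /* same pipeline as above */ ], { allowDiskUse: true })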
chridam
  • The collection contains nearly 20,000 items, each with 1 to 6 deposits, and each deposit contains 1-10 sizes (depending on product type, and sizes could contain dots in the name!). But everything is still at the design stage, so if there's a better way to handle such data, let me know... I'm willing to rewrite the schema from scratch...
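
Following up on the embed-or-reference links in the comments, one possible direction (a sketch only, not part of the original answer; the field names are illustrative) is to store one document per sku/deposit pair with numeric values:

{
    "_id" : ObjectId("..."),
    "sku" : "sku001",
    "deposit_id" : "deposit01",
    "total" : 3,
    "sizes" : [
        { "size" : "36", "stock" : 2 },
        { "size" : "38", "stock" : 0 }
    ]
}

With this shape, only one $unwind on sizes is needed before grouping by sku, which avoids the double flattening of nested arrays discussed in the comments above.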