20

I have a structure holding 3d co-ordinates in 3 ints. In a test I've put together a List<> of 1 million random points and then used Binary serialization to a memory stream.

The memory stream is coming in at ~21 MB - which seems very inefficient, as 1000000 points * 3 coords * 4 bytes should come out at 11 MB minimum.

It's also taking ~ 3 seconds on my test rig.

Any ideas for improving performance and/or size?

(I don't have to keep the ISerializable interface if it helps, I could write out directly to a memory stream)

EDIT - From answers below I've put together a serialization showdown comparing BinaryFormatter, 'Raw' BinaryWriter and Protobuf

using System;
using System.Text;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using ProtoBuf;

namespace asp_heatmap.test
{
    /// <summary>
    /// Holds a list of 3D integer points and exposes the three serialization
    /// routes compared in the tests: BinaryFormatter (via <see cref="ISerializable"/>),
    /// protobuf-net (via the Proto* attributes) and a hand-written
    /// BinaryWriter/BinaryReader format.
    /// </summary>
    [Serializable()] // For .NET BinaryFormatter
    [ProtoContract] // For Protobuf
    public class Coordinates : ISerializable
    {
        /// <summary>Number of random points appended by <see cref="SetupTestArray"/>.</summary>
        private const int TestPointCount = 1000000;

        /// <summary>A single 3D point stored as three 32-bit integers.</summary>
        [Serializable()]
        [ProtoContract]
        public struct CoOrd
        {
            /// <summary>Creates a point from its three components.</summary>
            public CoOrd(int x, int y, int z)
            {
                this.x = x;
                this.y = y;
                this.z = z;
            }
            [ProtoMember(1)]            
            public int x;
            [ProtoMember(2)]
            public int y;
            [ProtoMember(3)]
            public int z;
        }

        /// <summary>Creates an instance with an empty point list.</summary>
        internal Coordinates()
        {
        }

        /// <summary>The points held by this instance.</summary>
        [ProtoMember(1)]
        public List<CoOrd> Coords = new List<CoOrd>();

        /// <summary>
        /// Appends one million points with random non-negative components to
        /// <see cref="Coords"/>. Existing entries are kept.
        /// </summary>
        public void SetupTestArray()
        {
            Random r = new Random();
            for (int i = 0; i < TestPointCount; i++)
            {
                Coords.Add(new CoOrd(r.Next(), r.Next(), r.Next()));
            }
        }

        #region Using Framework Binary Formatter Serialization

        /// <summary>Stores the point list under the key "Coords" for BinaryFormatter.</summary>
        /// <exception cref="ArgumentNullException"><paramref name="info"/> is null.</exception>
        void ISerializable.GetObjectData(SerializationInfo info, StreamingContext context)
        {
            if (info == null)
            {
                throw new ArgumentNullException("info");
            }

            info.AddValue("Coords", this.Coords);
        }

        /// <summary>
        /// Deserialization constructor: restores the point list stored under
        /// the key "Coords".
        /// </summary>
        /// <exception cref="ArgumentNullException"><paramref name="info"/> is null.</exception>
        internal Coordinates(SerializationInfo info, StreamingContext context)
        {
            if (info == null)
            {
                throw new ArgumentNullException("info");
            }

            this.Coords = (List<CoOrd>)info.GetValue("Coords", typeof(List<CoOrd>));
        }

        #endregion

        # region 'Raw' Binary Writer serialization

        /// <summary>
        /// Writes the point count followed by x, y, z of every point as raw
        /// 32-bit integers into a new <see cref="MemoryStream"/>.
        /// </summary>
        /// <returns>
        /// The stream, still open and positioned after the last byte written;
        /// callers that want to read it back must rewind it first.
        /// </returns>
        public MemoryStream RawSerializeToStream()
        {
            // Pre-size the buffer: 3 ints of 4 bytes per point plus the 4-byte count.
            MemoryStream stream = new MemoryStream(Coords.Count * 3 * 4 + 4);

            // The writer is deliberately not disposed: disposing a BinaryWriter
            // closes the underlying stream, which is handed back to the caller.
            BinaryWriter writer = new BinaryWriter(stream);
            writer.Write(Coords.Count);
            foreach (CoOrd point in Coords)
            {
                writer.Write(point.x);
                writer.Write(point.y);
                writer.Write(point.z);
            }

            // Make sure nothing is left in the writer before the stream escapes.
            writer.Flush();
            return stream;
        }

        /// <summary>
        /// Reads points in the format produced by <see cref="RawSerializeToStream"/>,
        /// starting at the stream's current position.
        /// </summary>
        /// <param name="stream">
        /// Source stream. It is closed when this constructor returns, because
        /// the reader wrapping it is disposed.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
        public Coordinates(MemoryStream stream)
        {
            if (stream == null)
            {
                throw new ArgumentNullException("stream");
            }

            using (BinaryReader reader = new BinaryReader(stream))
            {
                int count = reader.ReadInt32();
                Coords = new List<CoOrd>(count);
                for (int i = 0; i < count; i++)                
                {
                    Coords.Add(new CoOrd(reader.ReadInt32(),reader.ReadInt32(),reader.ReadInt32()));
                }
            }        
        }
        #endregion
    }

    /// <summary>
    /// Round-trips a <see cref="Coordinates"/> instance through each serializer,
    /// printing the serialized length and the deserialized point count, and
    /// checking that the data came back intact.
    /// </summary>
    [TestClass]
    public class SerializationTest
    {
        /// <summary>Round trip through the framework BinaryFormatter.</summary>
        [TestMethod]
        public void TestBinaryFormatter()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            // `using` releases the stream even when (de)serialization throws.
            using (MemoryStream mStream = new MemoryStream())
            {
                // Serialize to memory stream
                BinaryFormatter bformatter = new BinaryFormatter();
                bformatter.Serialize(mStream, c);
                Console.WriteLine("Length : {0}", mStream.Length);

                // Now Deserialize
                mStream.Position = 0;
                Coordinates c2 = (Coordinates)bformatter.Deserialize(mStream);

                AssertRoundTrip(c, c2);
                Console.Write(c2.Coords.Count);
            }
        }

        /// <summary>Round trip through the hand-written BinaryWriter format.</summary>
        [TestMethod]
        public void TestBinaryWriter()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            // The Coordinates(MemoryStream) constructor already closes the stream;
            // the `using` covers the case where an exception is thrown before that.
            using (MemoryStream mStream = c.RawSerializeToStream())
            {
                Console.WriteLine("Length : {0}", mStream.Length);

                // Now Deserialize
                mStream.Position = 0;
                Coordinates c2 = new Coordinates(mStream);

                AssertRoundTrip(c, c2);
                Console.Write(c2.Coords.Count);
            }
        }

        /// <summary>Round trip through protobuf-net.</summary>
        [TestMethod]
        public void TestProtoBufV2()
        {
            Coordinates c = new Coordinates();
            c.SetupTestArray();

            using (MemoryStream mStream = new MemoryStream())
            {
                ProtoBuf.Serializer.Serialize(mStream,c);
                Console.WriteLine("Length : {0}", mStream.Length);

                mStream.Position = 0;
                Coordinates c2 = ProtoBuf.Serializer.Deserialize<Coordinates>(mStream);

                AssertRoundTrip(c, c2);
                Console.Write(c2.Coords.Count);
            }
        }

        /// <summary>
        /// Fails the test unless <paramref name="actual"/> has a point list of
        /// the same length as <paramref name="expected"/> with matching first
        /// and last points. Only the end points are compared, to keep the check
        /// cheap on a million-element list.
        /// </summary>
        private static void AssertRoundTrip(Coordinates expected, Coordinates actual)
        {
            Assert.IsNotNull(actual);
            Assert.IsNotNull(actual.Coords);
            Assert.AreEqual(expected.Coords.Count, actual.Coords.Count);

            if (expected.Coords.Count > 0)
            {
                int last = expected.Coords.Count - 1;
                Assert.AreEqual(expected.Coords[0], actual.Coords[0]);
                Assert.AreEqual(expected.Coords[last], actual.Coords[last]);
            }
        }
    }
}

Results (Note PB v2.0.0.423 beta)

                Serialize | Ser + Deserialize    | Size
-----------------------------------------------------------          
BinaryFormatter    2.89s  |      26.00s !!!      | 21.0 MB
ProtoBuf v2        0.52s  |       0.83s          | 18.7 MB
Raw BinaryWriter   0.27s  |       0.36s          | 11.4 MB

Obviously this is just looking at speed/size and doesn't take into account anything else.

Drew Noakes
  • 300,895
  • 165
  • 679
  • 742
Ryan
  • 23,871
  • 24
  • 86
  • 132
  • 1
    @Ryan, [this answer](http://stackoverflow.com/questions/703073/what-are-the-deficiencies-of-the-built-in-binaryformatter-based-net-serializatio/703361#703361) suggests using [protobuf-net](http://code.google.com/p/protobuf-net) for fast serialization. – bzlm Jun 25 '11 at 15:19
  • 1
    @Ryan and protobuf-net "v2" supports structs. Let me take a look later (not at a PC at the moment), but it is a definite option. – Marc Gravell Jun 25 '11 at 15:26
  • 1
    Binary serialization uses Reflection. That's slow but never a real issue because you use it for I/O. Why you serialize to memory is unguessable. – Hans Passant Jun 25 '11 at 15:26
  • Memory stream - because this will be stored in SQL database not file sys (but not 11MB I am using overly large list to emphasize perf problems) even if outputing to a file same problem applies - binary stream to file won't remove need for reflection if that is the cause. – Ryan Jun 25 '11 at 15:42
  • Wow - one vote to close as not a real question - wonder why? – Ryan Jun 25 '11 at 15:46
  • 1
    For clarification on the 18.7MB - this is being skewed by including the entire Int32 range, whereas protobuf *by default* optimises for smaller numbers. If you genuinely need 2^31 range, using fixed width may be preferred – Marc Gravell Jun 28 '11 at 05:27
  • Thanks a lot. I cannot deploy ProtoBuf v2 , because I'm constrained to use only MS pure .Net. Raw Binary Writer is over 50 times faster. – Diego Scaravaggi Mar 10 '14 at 17:04
  • @Diego - just to clarify that Marc's ProtoBuf is itself written in 'pure .NET'. Guess your bosses may still not allow the use of open source code in your projects though. – Ryan Mar 12 '14 at 10:23

2 Answers2

10

Binary serialisation using BinaryFormatter includes type information in the bytes it generates. This takes up additional space. It's useful in cases where you don't know what structure of data to expect at the other end, for example.

In your case, you know what format the data has at both ends, and that doesn't sound like it'd change. So you can write a simple encode and decode method. Your CoOrd class no longer needs to be serializable either.

I would use System.IO.BinaryReader and System.IO.BinaryWriter, then loop through each of your CoOrd instances and read/write the X,Y,Z property values to the stream. Those classes will even pack your ints into less than 11MB, assuming many of your numbers are smaller than 0x7F and 0x7FFF.

Something like this:

// Encode the list as a count prefix followed by one X, Y, Z triple per point.
// Note: disposing the BinaryWriter at the end of the using block also closes `stream`.
// Assumes `points` is the List<CoOrd> declared by the matching read snippet.
using (var output = new BinaryWriter(stream)) {
    // the length prefix tells the reader how many triples follow
    var total = points.Count;
    output.Write(total);
    for (var index = 0; index < total; index++) {
        var current = points[index];
        output.Write(current.X);
        output.Write(current.Y);
        output.Write(current.Z);
    }
}

To read from the stream:

// Decode a count-prefixed run of Int32 triples back into a list of points.
// Note: disposing the BinaryReader at the end of the using block also closes `stream`.
List<CoOrd> points;
using (var input = new BinaryReader(stream)) {
    // the leading Int32 is the number of triples that follow
    var remaining = input.ReadInt32();
    points = new List<CoOrd>(remaining);
    while (remaining-- > 0) {
        // arguments are evaluated left to right, so the reads happen in x, y, z order
        points.Add(new CoOrd(input.ReadInt32(), input.ReadInt32(), input.ReadInt32()));
    }
}
Drew Noakes
  • 300,895
  • 165
  • 679
  • 742
  • 2
    "Binary serialisation includes type information in the bytes it generates" - no, `BinaryFormatter` does that. Binary serialization *in general* does no such thing. – Marc Gravell Jun 25 '11 at 20:54
  • 2
    Right, yes that's the point I was trying to get across. Binary serialisation _in general_ is a concept, not a technique. Will edit to clarify. – Drew Noakes Jun 26 '11 at 13:50
3

For simplicity of using a pre-built serializer, I recommend protobuf-net; here is protobuf-net v2, with just adding some attributes:

/// <summary>
/// Holds a list of 3D integer points, annotated with data-contract attributes
/// so that a serializer can pick up the member order.
/// </summary>
[DataContract]
public class Coordinates
{
    /// <summary>A single 3D point stored as three 32-bit integers.</summary>
    [DataContract]
    public struct CoOrd
    {
        /// <summary>Creates a point from its three components.</summary>
        public CoOrd(int x, int y, int z)
        {
            this.x = x;
            this.y = y;
            this.z = z;
        }
        [DataMember(Order = 1)]
        int x;
        [DataMember(Order = 2)]
        int y;
        [DataMember(Order = 3)]
        int z;
    }

    /// <summary>The points held by this instance.</summary>
    [DataMember(Order = 1)]
    public List<CoOrd> Coords = new List<CoOrd>();

    /// <summary>
    /// Appends one million pseudo-random points to <see cref="Coords"/>.
    /// Existing entries are kept.
    /// </summary>
    public void SetupTestArray()
    {
        // Fixed seed: every run generates the same sequence of points.
        Random r = new Random(123456);
        for (int i = 0; i < 1000000; i++)
        {
            // Each component is drawn from [0, 10000).
            Coords.Add(new CoOrd(r.Next(10000), r.Next(10000), r.Next(10000)));
        }
    }
}

using:

// Serialize the Coordinates instance `c` into the stream `mStream` with protobuf-net.
ProtoBuf.Serializer.Serialize(mStream, c);

to serialize. This takes 10,960,823 bytes, but note that I tweaked SetupTestArray to limit the size to 10,000 since by default it uses "varint" encoding on the integers, which depends on the size. 10k isn't important here (in fact I didn't check what the "steps" are). If you prefer a fixed size (which will allow any range):

        // Fixed-size encoding for each component instead of the default varint
        // encoding, so the serialized size no longer depends on how large the
        // values are and any int range is allowed.
        [ProtoMember(1, DataFormat = DataFormat.FixedSize)]
        int x;
        [ProtoMember(2, DataFormat = DataFormat.FixedSize)]
        int y;
        [ProtoMember(3, DataFormat = DataFormat.FixedSize)]
        int z;

Which takes 16,998,640 bytes

Clueless
  • 1,190
  • 2
  • 14
  • 30
Marc Gravell
  • 1,026,079
  • 266
  • 2,566
  • 2,900
  • You're listing attributes DataContract and DataMember - should they be ProtoContract and ProtoMember or have I misunderstood? (have PB v2.0.0.404) – Ryan Jun 27 '11 at 23:14
  • @Ryan no mistake; it tries to be accommodating. It will use the Order from [DataMember], or from [XmlElement], to help transition from existing types. In particular, from LINQ-to-SQL. In v2 you don't even need attributes (you can tell it the bindings separately) – Marc Gravell Jun 27 '11 at 23:17
  • Also having trouble deserializing - Coordinates c2 = ProtoBuf.Serializer.Deserialize(mStream) - leaves c2.Coords null. I've put full source in Q edit. Have to admit not given enough time to RFM – Ryan Jun 27 '11 at 23:28
  • 1
    @Ryan - I can't repro that issue; I get 1M results right back. I might be a version ahead of you, of course... – Marc Gravell Jun 28 '11 at 05:35