66

This is not a terribly uncommon question, but I still couldn't seem to find an answer that really explained the choice.

I have a very large list of strings (ASCII representations of SHA-256 hashes, to be exact), and I need to query for the presence of a string within that list.

There will likely be in excess of 100 million entries in this list, and I will need to repeatedly query for the presence of an entry many times.

Given the size, I doubt I can stuff it all into a HashSet<string>. What would be an appropriate retrieval system to maximize performance?

I CAN pre-sort the list, I CAN put it into a SQL table, I CAN put it into a text file, but I'm not sure what really makes the most sense given my application.

Is there a clear winner in terms of performance among these, or other methods of retrieval?

Grant H.
  • 2
    At first glance, since it needs to be searched, the preferred way would be to store it in a Sql table, but it really depends what this list is, if it's a one-time, immutable conversion kind of thing, if maintenance is required, etc, etc... – Crono Mar 13 '14 at 20:19
  • 1
    @Crono, it's more or less immutable, if the list needed to change, then we'd likely just tear down and then build up the table again. If using SQL, would a single column with a clustered index be my best bet, or is there something else I can do as well? – Grant H. Mar 13 '14 at 20:21
  • Sounds like something I would try: see if it works and if performance is acceptable. SQL Server also has a more advanced text-searching engine, but I'm not very familiar with it. – Crono Mar 13 '14 at 20:28
  • 14
    Go with a "trie" - https://en.wikipedia.org/wiki/Trie. – Enigmativity Mar 13 '14 at 20:29
  • If you can sort the data, then you can use binary search on the list http://msdn.microsoft.com/en-us/library/w4e7fxsh(v=vs.110).aspx – Brian Rasmussen Mar 13 '14 at 20:50
  • 1
    You could just save the strings to a file and search it with Lucene.Net. – Jon Raynor Mar 13 '14 at 21:10
  • 25
    Does no one see the irony of using a ***`HashSet`*** to store ***`string`ed hashes?*** – recursion.ninja Mar 14 '14 at 18:35
  • 1
    Please could you explain how you came by this list of 100 million strings and why you want to test the set for membership? Otherwise, you might be doing something stupid. For example, there are other ways to do license checking. – Colonel Panic Mar 16 '14 at 23:14
  • 7
    Why use a Hash to store and lookup data that is, by itself, a hash? SHA256 is 256 bits. Your 100M entries are so sparse that the chance of collision in the same bucket is almost nil. Just take 32 bits (or some other number depending on your RAM) out of the entries and make a large vector array (containing references to the strings) for lookup. For collisions, just move to the next empty bucket. – Stephen Chung Mar 19 '14 at 04:58
  • @ColonelPanic: at least you see a legal use in it! When I started to wonder about it, the only use-case I could think of was having a list of password hashes that 'needs' to be brute-forced by taking a string, salting & hashing it and then checking if the result is used somewhere. But of course, maybe that tells more about me being overly paranoid than anything else =) – deroby Apr 09 '14 at 09:06
  • Nothing nefarious here, started as a simple security question at a meeting then became intrigued by how our test suite would work on a grand scale in .Net. Pretty neat results, the top answer is remarkably fast. – Grant H. Apr 09 '14 at 14:35

16 Answers

62
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Security.Cryptography;

namespace HashsetTest
{
    abstract class HashLookupBase
    {
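        // Storage is split across BucketCount collections so that no single HashSet or array has to
        // hold all of the entries at once (a single collection runs into .NET allocation limits here).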
        protected const int BucketCount = 16;

        private readonly HashAlgorithm _hasher;

        protected HashLookupBase()
        {
            _hasher = SHA256.Create();
        }

        public abstract void AddHash(byte[] data);
        public abstract bool Contains(byte[] data);

        private byte[] ComputeHash(byte[] data)
        {
            return _hasher.ComputeHash(data);
        }

        protected Data256Bit GetHashObject(byte[] data)
        {
            var hash = ComputeHash(data);
            return Data256Bit.FromBytes(hash);
        }

        public virtual void CompleteAdding() { }
    }

    class HashsetHashLookup : HashLookupBase
    {
        private readonly HashSet<Data256Bit>[] _hashSets;

        public HashsetHashLookup()
        {
            _hashSets = new HashSet<Data256Bit>[BucketCount];

            for(int i = 0; i < _hashSets.Length; i++)
                _hashSets[i] = new HashSet<Data256Bit>();
        }

        public override void AddHash(byte[] data)
        {
            var item = GetHashObject(data);
            var offset = item.GetHashCode() & 0xF;
            _hashSets[offset].Add(item);
        }

        public override bool Contains(byte[] data)
        {
            var target = GetHashObject(data);
            var offset = target.GetHashCode() & 0xF;
            return _hashSets[offset].Contains(target);
        }
    }

    class ArrayHashLookup : HashLookupBase
    {
        private Data256Bit[][] _objects;
        private int[] _offsets;
        private int _bucketCounter;

        public ArrayHashLookup(int size)
        {
            size /= BucketCount;
            _objects = new Data256Bit[BucketCount][];
            _offsets = new int[BucketCount];

            for(var i = 0; i < BucketCount; i++) _objects[i] = new Data256Bit[size + 1];

            _bucketCounter = 0;
        }

        public override void CompleteAdding()
        {
            for(int i = 0; i < BucketCount; i++) Array.Sort(_objects[i]);
        }

        public override void AddHash(byte[] data)
        {
            var hashObject = GetHashObject(data);
            _objects[_bucketCounter][_offsets[_bucketCounter]++] = hashObject;
            _bucketCounter++;
            _bucketCounter %= BucketCount;
        }

        public override bool Contains(byte[] data)
        {
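            // Entries were added round-robin, so the target could be in any bucket; check them all.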
            var hashObject = GetHashObject(data);
            return _objects.Any(o => Array.BinarySearch(o, hashObject) >= 0);
        }
    }

    struct Data256Bit : IEquatable<Data256Bit>, IComparable<Data256Bit>
    {
        public bool Equals(Data256Bit other)
        {
            return _u1 == other._u1 && _u2 == other._u2 && _u3 == other._u3 && _u4 == other._u4;
        }

        public int CompareTo(Data256Bit other)
        {
            var rslt = _u1.CompareTo(other._u1);    if (rslt != 0) return rslt;
            rslt = _u2.CompareTo(other._u2);        if (rslt != 0) return rslt;
            rslt = _u3.CompareTo(other._u3);        if (rslt != 0) return rslt;

            return _u4.CompareTo(other._u4);
        }

        public override bool Equals(object obj)
        {
            if (ReferenceEquals(null, obj))
                return false;
            return obj is Data256Bit && Equals((Data256Bit) obj);
        }

        public override int GetHashCode()
        {
            unchecked
            {
                var hashCode = _u1.GetHashCode();
                hashCode = (hashCode * 397) ^ _u2.GetHashCode();
                hashCode = (hashCode * 397) ^ _u3.GetHashCode();
                hashCode = (hashCode * 397) ^ _u4.GetHashCode();
                return hashCode;
            }
        }

        public static bool operator ==(Data256Bit left, Data256Bit right)
        {
            return left.Equals(right);
        }

        public static bool operator !=(Data256Bit left, Data256Bit right)
        {
            return !left.Equals(right);
        }

        private readonly long _u1;
        private readonly long _u2;
        private readonly long _u3;
        private readonly long _u4;

        private Data256Bit(long u1, long u2, long u3, long u4)
        {
            _u1 = u1;
            _u2 = u2;
            _u3 = u3;
            _u4 = u4;
        }

        public static Data256Bit FromBytes(byte[] data)
        {
            return new Data256Bit(
                BitConverter.ToInt64(data, 0),
                BitConverter.ToInt64(data, 8),
                BitConverter.ToInt64(data, 16),
                BitConverter.ToInt64(data, 24)
            );
        }
    }

    class Program
    {
        private const int TestSize = 150000000;

        static void Main(string[] args)
        {
            GC.Collect(3);
            GC.WaitForPendingFinalizers();

            {
                var arrayHashLookup = new ArrayHashLookup(TestSize);
                PerformBenchmark(arrayHashLookup, TestSize);
            }

            GC.Collect(3);
            GC.WaitForPendingFinalizers();

            {
                var hashsetHashLookup = new HashsetHashLookup();
                PerformBenchmark(hashsetHashLookup, TestSize);
            }

            Console.ReadLine();
        }

        private static void PerformBenchmark(HashLookupBase hashClass, int size)
        {
            var sw = Stopwatch.StartNew();

            for (int i = 0; i < size; i++)
                hashClass.AddHash(BitConverter.GetBytes(i * 2));

            Console.WriteLine("Hashing and addition took " + sw.ElapsedMilliseconds + "ms");

            sw.Restart();
            hashClass.CompleteAdding();
            Console.WriteLine("Hash cleanup (sorting, usually) took " + sw.ElapsedMilliseconds + "ms");

            sw.Restart();
            var found = 0;

            for (int i = 0; i < size * 2; i += 10)
            {
                found += hashClass.Contains(BitConverter.GetBytes(i)) ? 1 : 0;
            }

            Console.WriteLine("Found " + found + " elements (expected " + (size / 5) + ") in " + sw.ElapsedMilliseconds + "ms");
        }
    }
}

Results are pretty promising. They run single-threaded. The hashset version can hit a little over 1 million lookups per second at 7.9GB RAM usage. The array-based version uses less RAM (4.6GB). Startup times between the two are nearly identical (388 vs 391 seconds). The hashset trades RAM for lookup performance. Both had to be bucketized because of memory allocation constraints.

Array performance:

Hashing and addition took 307408ms

Hash cleanup (sorting, usually) took 81892ms

Found 30000000 elements (expected 30000000) in 562585ms [53k searches per second]

======================================

Hashset performance:

Hashing and addition took 391105ms

Hash cleanup (sorting, usually) took 0ms

Found 30000000 elements (expected 30000000) in 74864ms [400k searches per second]

Bryan Boettcher
  • 1
    So, I gave this a shot last night, and it works like a dream! It takes about 20 minutes to load all of the data into memory (could have parallelized it, but was concerned the buffering required for this might put me over the edge), but once it's there, the query speed is fantastically fast. The memory usage is quite high (~9gb), but my 64-bit machine with 16 gigs of ram didn't mind it. – Grant H. Mar 14 '14 at 14:02
  • What is the purpose of using multiple hash sets? Also, because he's searching SHA hashes, each part of the hash should be sufficiently random to significantly simplify `GetHashCode()`. – Cory Nelson Mar 14 '14 at 14:09
  • 3
    Multiple hash sets is because one hash set OOMs at 93m records. An improvement can be made to the class by using the hash data to determine which bucket to drop the hash into. This may produce a more uneven storage distribution but lookups will go directly to the hash in question instead of trying all of them. All the equality parts were R#'s autogenerated ones. – Bryan Boettcher Mar 14 '14 at 14:15
  • Setting [gcAllowVeryLargeObjects](http://msdn.microsoft.com/en-us/library/hh285054(v=vs.110).aspx) in your app.config didn't let you make a larger hash set? – Jim Mischel Mar 14 '14 at 20:48
  • @insta, a million lookups a second. Wow, this is definitely the definitive answer for this question. Thank you for providing such a complete answer. – Grant H. Mar 14 '14 at 20:55
  • @JimMischel, I actually found I didn't need to set that when working in .Net 4. Someone else mentioned that as well I think. – Grant H. Mar 14 '14 at 20:56
  • @JimMischel: it is what got me to 93m records. It is set in the app.config – Bryan Boettcher Mar 14 '14 at 21:25
  • I was able to allocate an array of 150 million 32-bit items: over 4 gigabytes. You have to set `gcAllowVeryLargeObjects`, and you also have to make sure that if you're compiling with Any CPU, that you disable the "prefer 32 bit" in the project options. Unfortunately I can't test your `HashSet` solution with that many items because I only have 8 gigabytes of RAM. But it should work if you have enough RAM. But you'll need close to 16 GB free 'cause the hash set has to grow by copying. Unless you create the hash set from the array ... hmmmmm. – Jim Mischel Mar 14 '14 at 22:12
  • Is this a bloom filter? – Navin Mar 15 '14 at 06:00
  • @Navin: Neither is, at least explicitly. One is a hash lookup, the other is a binary search. – Bryan Boettcher Mar 18 '14 at 20:21
21

If the list changes over time, I would put it in a database.

If the list doesn't change, I would put it in a sorted file and do a binary search for every query.

In both cases, I would use a Bloom filter to minimize I/O. And I would stop using strings and use the binary representation with four ulongs (to avoid the object reference cost).
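
For illustration, a minimal sketch of that packing (the type name and the use of Convert.ToUInt64 are my own choices, not from the answer): a 64-character hex string becomes four ulongs.

struct Hash256
{
    private readonly ulong _a, _b, _c, _d;

    public Hash256(string hex)              // expects 64 hex characters
    {
        _a = Convert.ToUInt64(hex.Substring(0, 16), 16);
        _b = Convert.ToUInt64(hex.Substring(16, 16), 16);
        _c = Convert.ToUInt64(hex.Substring(32, 16), 16);
        _d = Convert.ToUInt64(hex.Substring(48, 16), 16);
    }
}

That is 32 bytes per value, with no per-string object header or character buffer. Add equality/comparison members, as in the other answers, if you put these in a set or a sorted array.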

If you have more than 16 GB (2*64*4/3*100M, assuming Base64 encoding) to spare, an option is to make a Set<string> and be happy. Of course it would fit in less than 7 GB if you use the binary representation.

David Haney's answer shows us that the memory cost is not so easily calculated.

Juan Lopes
  • 1
    Good idea to use a Bloom filter, but only use it if there is a medium to high chance the value is not in the set. It can only provide the "certainly not" or "probably it is" answer to the question: "Is this value in the set?". If the answer is "probably it is in the set", then you still need to look it up to make sure it wasn't a false positive. – AutomatedChaos Mar 14 '14 at 09:15
17

With <gcAllowVeryLargeObjects>, you can have arrays that are much larger. Why not convert those ASCII representations of 256-bit hash codes to a custom struct that implements IComparable<T>? It would look like this:

struct MyHashCode: IComparable<MyHashCode>
{
    // make these readonly and provide a constructor
    ulong h1, h2, h3, h4;

    public int CompareTo(MyHashCode other)
    {
        var rslt = h1.CompareTo(other.h1);
        if (rslt != 0) return rslt;
        rslt = h2.CompareTo(other.h2);
        if (rslt != 0) return rslt;
        rslt = h3.CompareTo(other.h3);
        if (rslt != 0) return rslt;
        return h4.CompareTo(other.h4);
    }
}

You can then create an array of these, which would occupy approximately 3.2 GB. You can search it easily enough with Array.BinarySearch.

Of course, you'll need to convert the user's input from ASCII to one of those hash code structures, but that's easy enough.

As for performance, this isn't going to be as fast as a hash table, but it's certainly going to be faster than a database lookup or file operations.

Come to think of it, you could create a HashSet<MyHashCode>. You'd have to override the Equals method on MyHashCode, but that's really easy. As I recall, the HashSet costs something like 24 bytes per entry, and you'd have the added cost of the larger struct. Figure five or six gigabytes, total, if you were to use a HashSet. More memory, but still doable, and you get O(1) lookup.
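
A rough sketch of those overrides, added inside the struct above (taking advantage of the fact that the value is already a cryptographic hash, so any 32-bit slice of it makes a perfectly good hash code):

public override bool Equals(object obj)
{
    return obj is MyHashCode && Equals((MyHashCode)obj);
}

public bool Equals(MyHashCode other)    // consider also declaring IEquatable<MyHashCode>
{
    return h1 == other.h1 && h2 == other.h2 && h3 == other.h3 && h4 == other.h4;
}

public override int GetHashCode()
{
    // The value is already an SHA-256 hash, so its low 32 bits are as well distributed as anything.
    return unchecked((int)h1);
}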

Jim Mischel
15

These answers don't factor in the memory the strings themselves consume. Strings are not 1 char == 1 byte in .NET. Each string object requires a constant 20 bytes for the object data, and the buffer requires 2 bytes per character. Therefore, the memory usage estimate for a string instance is 20 + (2 * Length) bytes.

Let's do some math.

  • 100,000,000 UNIQUE strings
  • SHA256 = 32 bytes (256 bits)
  • each hash as a hex string = 64 characters, so size of each string = 20 + (2 * 64) = 148 bytes
  • Total required memory: 14,800,000,000 bytes ≈ 13.8 gigabytes

It is possible, but this will not store well in .NET memory. Your goal should be to load all of this data into a form that can be accessed/paged without holding it all in memory at once. For that I'd use Lucene.net, which will store your data on disk and search it intelligently. Write each string to the index as a searchable field and then search the index for the string. Now you have a scalable app that can handle this problem; your only limitation will be disk space (and it would take a lot of strings to fill up a terabyte drive). Alternatively, put these records in a database and query against it. That's why databases exist: to persist things outside of RAM. :)
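
For illustration only, the Lucene.Net route looks roughly like this against the classic 3.x API (type names vary between versions; the "hash" field name and index path are arbitrary; the namespaces involved are Lucene.Net.Analysis, .Documents, .Index, .Search and .Store):

// Build the index once: one document per hash, indexed as a single un-analyzed term.
var dir = FSDirectory.Open(new DirectoryInfo("hash-index"));
using (var writer = new IndexWriter(dir, new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED))
{
    foreach (var hash in allHashes)    // allHashes: your 100M strings
    {
        var doc = new Document();
        doc.Add(new Field("hash", hash, Field.Store.NO, Field.Index.NOT_ANALYZED));
        writer.AddDocument(doc);
    }
}

// Query: an exact term lookup against the on-disk index.
using (var searcher = new IndexSearcher(dir))
{
    bool exists = searcher.Search(new TermQuery(new Term("hash", candidate)), 1).TotalHits > 0;
}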

Haney
  • 9
    An SHA256 hash is 256 bits long, not 256 bytes. 32 bytes expressed as hexadecimal characters is 64 characters, or 128 bytes. Each string will take about 148 bytes, not 532 bytes. He should be able to fit all the strings into 11 or 12 gigabytes. By the way, if hashes were 256 bytes long, they would require 1024 bytes each (2 characters to encode a byte, times 2 bytes per character). – Jim Mischel Mar 13 '14 at 21:41
  • Jim - that's not how I understand the string representation memory cost. Can you elaborate on the 2 chars to encode x 2 bytes per char? – Haney Mar 14 '14 at 00:57
  • See Jon Skeet's answer re: string size: http://stackoverflow.com/a/3967458/2420979 Also see his updated blog post: http://msmvps.com/blogs/jon_skeet/archive/2011/04/05/of-memory-and-strings.aspx – Haney Mar 14 '14 at 01:00
  • I assumed that his ASCII representation of a 32-byte hash is in hex. For example, the bytes `0x1B, 0xEF` would be represented by the string `"1BEF"`. That's four characters to represent two bytes. Each character in that string requires two bytes (UTF-16). So representing those two bytes as a string requires 8 bytes, not counting the string allocation overhead. A 256 bit hash (32 bytes), then, requires 64 hex characters, which requires 128 bytes plus allocation overhead. – Jim Mischel Mar 14 '14 at 04:52
  • Using that calculation, each string would be 148 bytes long, resulting in a total of 14.8 billion bytes, or approximately 13.78 gigabytes. So I was low in my estimate of "11 or 12 gigabytes." – Jim Mischel Mar 14 '14 at 05:02
  • 2
    If you were going to store strings (pointless here since there's obviously a more compact representation of a 32-byte binary structure than the hexadecimal string thereof), then you would not necessarily store them as strings. A compact DAWG for example can often have cases where some insertions reduce total memory size. – Jon Hanna Mar 14 '14 at 12:45
  • Thanks for the explanation @JimMischel - You're discussing representing the strings as base-64 I believe. As such I get what you're saying. Either way, storing it all in memory is not necessarily optimal. – Haney Mar 14 '14 at 15:18
  • 3
    And actually, I bet this could be very efficiently represented with a Prefix Trie. In fact, I bet it would be stupidly efficient. – Haney Mar 14 '14 at 15:19
  • 1
    Actually, I'm discussing representing the strings as hexadecimal characters (using only the characters 0-9 and A-F). Base64 encoding would require 44 characters (although you could cut it to 43 because you know that the last character is irrelevant in this case) to represent 32 bytes. So if the hashes were represented as Base64 the strings would only be 86 bytes, plus allocation overhead. – Jim Mischel Mar 14 '14 at 20:55
  • 1
    @JonHanna I made a DAWG of around 30,000 random 64-character SHA256 hash strings using [this](https://github.com/rttlesnke/dawg-java). It's around 7 MB - at least 13 times bigger than the DAWG of the scrabble dictionary [TWL06](http://www.isc.ro/lists/twl06.zip), which has around 180,000 words. So a DAWG is probably not right for this task since the randomness makes it unusable. – max Mar 19 '14 at 11:16
  • 1
    @Max yes, thinking about it, there isn't going to be the large number of shared prefixes and suffixes that makes DAWGs work well with a lot of natural languages. – Jon Hanna Mar 19 '14 at 12:59
8

A hashset splits your data into buckets (arrays). On a 64-bit system, the size limit for an array is 2 GB, which is roughly 2,000,000,000 bytes.

Since a string is a reference type, and since a reference takes eight bytes (assuming a 64-bit system), each bucket can hold approximately 250,000,000 (250 million) references to strings. It seems to be way more than what you need.

That being said, as Tim S. pointed out, it's highly unlikely you'll have the necessary memory to hold the strings themselves, even though the references would fit into the hashset. A database would be a much better fit for this.

Peter Mortensen
dcastro
  • So how does the storage of the strings themselves factor in? Assuming a string size of around ~400 bytes, that only leaves room for around 4,000,000 strings in a 2GB segment, does it not? – Grant H. Mar 13 '14 at 20:35
  • I second that answer -- a hashset should work great, and the lookup performance should be something close to O(1). – JMarsch Mar 13 '14 at 20:38
  • 1
    @GrantH. It doesn't. The array doesn't store the string itself, it stores references to the strings. Imagine billions of stars scattered in the night sky, and then picture a line of people, each person pointing to an individual star. That line can't be longer than 250 million people. (Sorry, I got too excited watching the comeback of Cosmos). – dcastro Mar 13 '14 at 20:38
  • 3
    A SHA256 hash is 256 bytes. A base64 encoding (figured that's what's meant by "ASCII representations") means it takes ~341 chars. Each char in a string is represented by two bytes (UTF-16) in .Net, so ~682 bytes. 682 bytes * 100,000,000 ~= 63 TB. So unless you've got 64TB of memory, this is *way* too much data to keep in memory at once (regardless of how you reference it). – Tim S. Mar 13 '14 at 20:38
  • @GrantH. The string is a reference type, so only the 8 byte reference "counts" against your max size. Doesn't matter how long or short the string is. – JMarsch Mar 13 '14 at 20:39
  • Pointers actually *point* to somewhere, don't they? And that somewhere would happen to be an array of chars. With millions of them loaded at once, memory *will* be an issue. – Crono Mar 13 '14 at 20:40
  • That makes sense, thanks for the clarification. That said, @Tim S. is correct (thanks for the math correction) in that I most certainly will not have access to that kind of memory. – Grant H. Mar 13 '14 at 20:41
  • @TimS. That's a great point. In that case, you'll need to use a database. – dcastro Mar 13 '14 at 20:41
  • @TimS. Isn't that 63GB? Still a lot, and probably impractical to store in memory, but I think the number is an order of magnitude off. – JMarsch Mar 13 '14 at 20:42
  • @JMarsch Ohhh you're right. I must've entered too many 0's somewhere. It's ~63GB. Too late for me to edit my comment. – Tim S. Mar 13 '14 at 20:45
  • All of the above is very wrong. A string in .NET is roughly `20 + (2 * Length) bytes` and it's not going to fit into memory nicely in many cases - see my answer. – Haney Mar 13 '14 at 21:02
  • @DavidHaney According to [Jon Skeet](http://msmvps.com/blogs/jon_skeet/archive/2011/04/05/of-memory-and-strings.aspx), it's `26 + 2n` (best source I could find, but if you have an official one, let me know). That might mean the calculations are a bit off by a couple of GBs - but the point stands, it's unfeasible. How is that "very wrong"? – dcastro Mar 13 '14 at 21:12
  • 1
    [There is no longer a 2GB limit](http://msdn.microsoft.com/en-us/library/hh285054.aspx) if you configure your app correctly. – Cory Nelson Mar 13 '14 at 21:41
  • 4
    An SHA256 hash is 256 *bits*, not bytes. He could fit all the strings in 11 or 12 gigabytes. But that's a hugely expensive way of doing things. An array of 32-byte structs will take 3.2 gigs, which seems very reasonable. – Jim Mischel Mar 13 '14 at 21:55
8

For maximum speed, keep them in RAM. It's only ~3GB worth of data, plus whatever overhead your data structure needs. A HashSet<byte[]> (with an IEqualityComparer<byte[]> that compares contents, since arrays compare by reference) should work just fine. If you want to lower overhead and GC pressure, turn on <gcAllowVeryLargeObjects>, use a single byte[], and a HashSet<int> with a custom comparer to index into it.

For speed and low memory usage, store them in a disk-based hash table. For simplicity, store them in a database.

Whatever you do, you should store them as plain binary data, not strings.
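
A sketch of that "single byte[] plus HashSet<int>" idea (type and member names here are mine): one flat byte[] holds every 32-byte hash back to back, and the set stores byte offsets into it, so there is one large object instead of 100 million small ones.

class HashOffsetComparer : IEqualityComparer<int>
{
    private readonly byte[] _store;    // all hashes, 32 bytes each, concatenated

    public HashOffsetComparer(byte[] store) { _store = store; }

    public bool Equals(int x, int y)
    {
        // Compare the two 32-byte slices in place.
        for (int i = 0; i < 32; i++)
            if (_store[x + i] != _store[y + i]) return false;
        return true;
    }

    public int GetHashCode(int offset)
    {
        // The stored value is already an SHA-256 hash, so four raw bytes make a fine hash code.
        return BitConverter.ToInt32(_store, offset);
    }
}

// Usage: var set = new HashSet<int>(new HashOffsetComparer(store)); then Add(offset) for each slot.
// To test an incoming hash, copy its 32 bytes into a spare slot at the end of the store and call
// set.Contains(spareSlotOffset).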

Cory Nelson
  • A `HashSet` is fairly expensive. Allocating an array requires something on the order of 50 bytes of overhead. So your overhead is larger than the data. Better off creating a `struct` of 4 `ulong` values. – Jim Mischel Mar 13 '14 at 22:19
7

It might take a while (1) to dump all the records in a (clustered indexed) table (preferably use their values, not their string representation (2)) and let SQL do the searching. It will handle binary searching for you, it will handle caching for you and it's probably the easiest thing to work with if you need to make changes to the list. And I'm pretty sure that querying things will be just as fast (or faster) than building your own.

(1): For loading the data have a look at the SqlBulkCopy object, things like ADO.NET or Entity Framework are going to be too slow as they load the data row by row.

(2): SHA-256 = 256 bits, so a binary(32) will do; which is only half of the 64 characters you're using now. (Or a quarter of it if you're using Unicode numbers =P) Then again, if you currently have the information in a plain text-file you could still go the char(64) way and simply dump the data in the table using bcp.exe. The database will be bigger, the queries slightly slower (as more I/O is needed + the cache holds only half of the information for the same amount of RAM), etc... But it's quite straightforward to do, and if you're not happy with the result you can still write your own database-loader.
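
Roughly, the SqlBulkCopy route looks like this (table and column names are illustrative; connectionString, rawHashes and candidate are assumed to exist; namespaces are System.Data and System.Data.SqlClient):

// Assumed schema: CREATE TABLE dbo.Hashes (Hash binary(32) NOT NULL PRIMARY KEY CLUSTERED);

// Bulk load. In practice, feed WriteToServer in batches or via an IDataReader rather than
// building one 100M-row DataTable in memory.
var table = new DataTable();
table.Columns.Add("Hash", typeof(byte[]));
foreach (var hash in rawHashes)    // rawHashes: IEnumerable<byte[]>, 32 bytes each
    table.Rows.Add(hash);

using (var bulk = new SqlBulkCopy(connectionString))
{
    bulk.DestinationTableName = "dbo.Hashes";
    bulk.BatchSize = 100000;
    bulk.WriteToServer(table);
}

// Lookup: a single clustered index seek per query.
using (var conn = new SqlConnection(connectionString))
using (var cmd = new SqlCommand("SELECT 1 FROM dbo.Hashes WHERE Hash = @h", conn))
{
    cmd.Parameters.Add("@h", SqlDbType.Binary, 32).Value = candidate;    // candidate: byte[32]
    conn.Open();
    bool exists = cmd.ExecuteScalar() != null;
}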

Peter Mortensen
deroby
7

You need to be careful in this sort of situation, as most collections in most languages are not really designed or optimized for that sort of scale. As you have already identified, memory usage will be a problem too.

The clear winner here is to use some form of database, either a SQL database or one of the many NoSQL databases that would be appropriate.

A SQL server is already designed and optimized for keeping track of large amounts of data, indexing it, and searching and querying across those indexes. It's designed for exactly what you are trying to do, so that really would be the best way to go.

For performance you could consider using an embedded database that will run within your process and save the resulting communications overhead. For Java I could recommend a Derby database for that purpose, I'm not aware of the C# equivalents enough to make a recommendation there but I imagine suitable databases exist.
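
For what it's worth, SQLite (for example via the System.Data.SQLite provider) is a common embedded, in-process choice on .NET; a minimal sketch, with table and column names of my own choosing:

using (var conn = new SQLiteConnection("Data Source=hashes.db"))
{
    conn.Open();

    using (var create = new SQLiteCommand(
        "CREATE TABLE IF NOT EXISTS hashes (hash BLOB PRIMARY KEY)", conn))
        create.ExecuteNonQuery();

    // ... bulk insert the 32-byte values inside a transaction, then query:
    using (var query = new SQLiteCommand("SELECT 1 FROM hashes WHERE hash = @h", conn))
    {
        query.Parameters.AddWithValue("@h", candidateHash);    // candidateHash: byte[32]
        bool exists = query.ExecuteScalar() != null;
    }
}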

Tim B
6

If the set is constant, then just make a big sorted hash list (in raw format, 32 bytes each). Store the hashes aligned to disk sectors (4 KB), so that the beginning of each sector is also the beginning of a hash. Save the first hash of every Nth sector in a special index list, which will easily fit into memory. Use binary search on this index list to determine the starting sector of the sector cluster where the hash should be, and then use another binary search within this sector cluster to find your hash. The value N should be determined by measuring with test data.
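
A rough sketch of that two-level lookup (all names, and the idea of loading the every-Nth-sector index into a byte[][] at startup, are illustrative):

static class SectorSearch
{
    const int SectorSize = 4096;
    const int HashSize = 32;

    // index[i] = first hash of the i-th cluster of N sectors; file = the big sorted hash file.
    public static bool Contains(FileStream file, byte[][] index, int sectorsPerCluster, byte[] target)
    {
        // 1. Binary search the in-memory index for the last cluster whose first hash <= target.
        int lo = 0, hi = index.Length - 1, cluster = 0;
        while (lo <= hi)
        {
            int mid = (lo + hi) / 2;
            if (Compare(index[mid], 0, target) <= 0) { cluster = mid; lo = mid + 1; }
            else hi = mid - 1;
        }

        // 2. Read that cluster from disk and binary search the hashes inside it.
        var buffer = new byte[SectorSize * sectorsPerCluster];
        file.Seek((long)cluster * buffer.Length, SeekOrigin.Begin);
        int read = file.Read(buffer, 0, buffer.Length);

        int first = 0, last = read / HashSize - 1;
        while (first <= last)
        {
            int mid = (first + last) / 2;
            int cmp = Compare(buffer, mid * HashSize, target);
            if (cmp == 0) return true;
            if (cmp < 0) first = mid + 1; else last = mid - 1;
        }
        return false;
    }

    static int Compare(byte[] a, int offset, byte[] b)
    {
        for (int i = 0; i < b.Length; i++)
        {
            int d = a[offset + i].CompareTo(b[i]);
            if (d != 0) return d;
        }
        return 0;
    }
}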

EDIT: An alternative would be to implement your own hash table on disk. The table should use an open addressing strategy, and the probe sequence should be restricted to the same disk sector as much as possible. Empty slots have to be marked with a special value (all zeroes, for instance), so this special value has to be handled specially when queried for existence. To avoid collisions the table should not be more than 80% full, so in your case, with 100 million entries of 32 bytes each, the table should have at least 100M / 80% = 125 million slots and a size of 125M * 32 bytes = 4 GB. You only need to create the hashing function that converts the 2^256 domain to 125M, and some nice probe sequence.

Dialecticus
5

You can try a suffix tree; this question goes over how to do it in C#.

Or you can try a search like so

var matches = list.AsParallel().Where(s => s.Contains(searchTerm)).ToList();

AsParallel will help speed things up, as it parallelizes the query.

datatest
2
  1. Store your hashes as UInt32[8].

2a. Use a sorted list. To compare two hashes, compare their first elements; if they are equal, compare the second ones, and so on.

2b. Use a prefix tree.

Kirill Gamazkov
1

First of all, I would really recommend that you use data compression to minimize resource consumption. Cache and memory bandwidth are usually the most limited resources in a modern computer. No matter how you implement this, the biggest bottleneck will be waiting for data.

Also, I would recommend using an existing database engine. Many of them have built-in compression, and any database will make use of the RAM you have available. If you have a decent operating system, the system cache will store as much of the file as it can, but most databases also have their own caching subsystem.

I can't really tell which DB engine will be best for you; you have to try them out. Personally I often use H2, which has decent performance, can be used as both an in-memory and a file-based database, and has built-in transparent compression.

I see that some have stated that importing your data into a database and building the search index may take longer than some custom solution. That may be true, but importing is usually something quite rare. I am going to assume that you are more interested in fast searches, as they are likely to be the most common operation.

Also, while SQL databases are both reliable and quite fast, you may want to consider NoSQL databases. Try out a few alternatives; the only way to know which solution will give you the best performance is by benchmarking them.

You should also consider whether storing your list as text makes sense. Perhaps you should convert the list to numeric values; that will use less space and therefore give you faster queries. Database import may be significantly slower, but queries may become significantly faster.

user1657170
  • 1
    Can you really compress SHA hashes, which are effectively random strings? – Almo Mar 14 '14 at 19:13
  • Well, you can convert them to int array of size (256/8) = 32. Even if your hashes are encoded with Base64, you still have 33% overhead because each 8 bit character encodes only a 6 bit of your hash – Kirill Gamazkov Mar 15 '14 at 11:35
  • There's a typo in the comment above: if hash is represented as int array, then there are 8 integers in it – Kirill Gamazkov Mar 15 '14 at 11:45
  • If you use a string encoding that makes sense, it will only use a subset of all available characters in order to be printable and readable. You don't really wanna use backspace or arrow characters in such a string. Also, you do not compress the strings, you compress blocks of stored data which contain many strings. Compressing small amounts of data almost always fails. – user1657170 Mar 16 '14 at 08:11
1

If you want it really fast, and the elements are more or less immutable and require exact matches, you can build something that operates like a virus scanner: set the scope to collect the minimum number of potential elements using whatever algorithms are relevant to your entries and search criteria, then iterate through those items, testing against the search item using RtlCompareMemory. You can pull the items from disk if they are fairly contiguous and compare using something like this:

    private Boolean CompareRegions(IntPtr hFile, long nPosition, IntPtr pCompare, UInt32 pSize)
    {
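        // Assumes the usual Win32 P/Invoke declarations (VirtualAlloc, VirtualFree, SetFilePointerEx,
        // ReadFile) and ntdll's RtlCompareMemory, plus the MEM_COMMIT, PAGE_READWRITE, FILE_BEGIN
        // and MEM_RELEASE constants, are defined elsewhere via [DllImport].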
        IntPtr pBuffer = IntPtr.Zero;
        UInt32 iRead = 0;

        try
        {
            pBuffer = VirtualAlloc(IntPtr.Zero, pSize, MEM_COMMIT, PAGE_READWRITE);

            SetFilePointerEx(hFile, nPosition, IntPtr.Zero, FILE_BEGIN);
            if (ReadFile(hFile, pBuffer, pSize, ref iRead, IntPtr.Zero) == 0)
                return false;

            if (RtlCompareMemory(pCompare, pBuffer, pSize) == pSize)
                return true; // equal

            return false;
        }
        finally
        {
            if (pBuffer != IntPtr.Zero)
                VirtualFree(pBuffer, pSize, MEM_RELEASE);
        }
    }

I would modify this example to grab a large buffer full of entries and loop through those. But managed code may not be the way to go. Fastest is always closest to the calls that do the actual work, so a driver with kernel-mode access built on straight C would be much faster.

JGU
1

Firstly, you say the strings are really SHA256 hashes. Observe that 100 million * 256 bits = 3.2 gigabytes, so it is possible to fit the entire list in memory, assuming you use a memory-efficient data structure.

If you can forgive occasional false positives, you can actually use less memory than that. See bloom filters http://billmill.org/bloomfilter-tutorial/

Otherwise, use a sorted data structure to achieve fast querying (time complexity O(log n)).


If you really do want to store the data in memory (because you're querying frequently and need fast results), try Redis. http://redis.io/

Redis is an open source, BSD licensed, advanced key-value store. It is often referred to as a data structure server since keys can contain strings, hashes, lists, sets and sorted sets.

It has a set datatype http://redis.io/topics/data-types#sets

Redis Sets are an unordered collection of Strings. It is possible to add, remove, and test for existence of members in O(1) (constant time regardless of the number of elements contained inside the Set).
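
A minimal sketch with the StackExchange.Redis client (my choice of client and key name; the answer itself only specifies Redis):

var redis = ConnectionMultiplexer.Connect("localhost");
var db = redis.GetDatabase();

// Load once: SADD each 32-byte hash into a single Redis set.
foreach (var hash in allHashes)    // allHashes: IEnumerable<byte[]>
    db.SetAdd("sha256:known", hash);

// Query: SISMEMBER, O(1) per lookup.
bool exists = db.SetContains("sha256:known", candidateHash);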


Otherwise, use a database that saves the data on disk.

Colonel Panic
0

A plain vanilla binary search tree will give excellent lookup performance on large lists. However, if you don't really need to store the strings and simple membership is what you want to know, a Bloom filter may be a terrific solution. Bloom filters are a compact data structure that you train with all the strings. Once trained, it can quickly tell you whether it has seen a string before. It rarely reports false positives, but never reports false negatives. Depending on the application, they can produce amazing results quickly and with relatively little memory.
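
A minimal illustrative Bloom filter for 32-byte SHA-256 values (the sizes, and the trick of slicing bit positions straight out of the hash instead of computing k separate hash functions, are my own choices for the sketch):

class Sha256BloomFilter
{
    private readonly System.Collections.BitArray _bits;
    private readonly int _k;

    public Sha256BloomFilter(int bitCount = 1 << 30, int k = 4)
    {
        _bits = new System.Collections.BitArray(bitCount);
        _k = k;
    }

    private int Index(byte[] hash, int slice)
    {
        // Each probe uses a different 4-byte slice of the (already random) hash.
        uint v = BitConverter.ToUInt32(hash, slice * 4);
        return (int)(v % (uint)_bits.Length);
    }

    public void Add(byte[] hash)
    {
        for (int i = 0; i < _k; i++) _bits[Index(hash, i)] = true;
    }

    // false = definitely not in the set; true = probably in the set (verify against the real store).
    public bool MightContain(byte[] hash)
    {
        for (int i = 0; i < _k; i++)
            if (!_bits[Index(hash, i)]) return false;
        return true;
    }
}

Back of the envelope: 2^30 bits is 128 MB, and with 100 million entries and 4 probes that works out to roughly a 1% false-positive rate.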

  • Perhaps you can support your answer with some examples and/or code fragments, along with an explanation of how it would perform better than the `HashSet` the OP was considering. – Amber Mar 19 '14 at 14:14
0

I developed a solution similar to Insta's approach, but with some differences. In effect, it looks a lot like his chunked array solution. However, instead of simply splitting the data, my approach builds an index of chunks and directs the search only to the appropriate chunk.

The way the index is built is very similar to a hashtable, with each bucket being a sorted array that can be searched with a binary search. However, I figured that there's little point in computing a hash of an SHA256 hash, so instead I simply take a prefix of the value.

The interesting thing about this technique is that you can tune it by extending the length of the index keys. A longer key means a larger index and smaller buckets. My test case of 8 bits is probably on the small side; 10-12 bits would probably be more effective.

I attempted to benchmark this approach, but it quickly ran out of memory so I wasn't able to see anything interesting in terms of performance.

I also wrote a C implementation. The C implementation wasn't able to deal with a data set of the specified size either (the test machine has only 4GB of RAM), but it did manage somewhat more. (The target data set actually wasn't so much of a problem in that case, it was the test data that filled up the RAM.) I wasn't able to figure out a good way to throw data at it fast enough to really see its performance tested.

While I enjoyed writing this, I'd say overall it mostly provides evidence in favor of the argument that you shouldn't be trying to do this in memory with C#.

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;

public interface IKeyed
{
    int ExtractKey();
}

struct Sha256_Long : IComparable<Sha256_Long>, IKeyed
{
    private UInt64 _piece1;
    private UInt64 _piece2;
    private UInt64 _piece3;
    private UInt64 _piece4;

    public Sha256_Long(string hex)
    {
        if (hex.Length != 64)
        {
            throw new ArgumentException("Hex string must contain exactly 64 digits.");
        }
        UInt64[] pieces = new UInt64[4];
        for (int i = 0; i < 4; i++)
        {
            pieces[i] = UInt64.Parse(hex.Substring(i * 16, 16), NumberStyles.HexNumber);
        }
        _piece1 = pieces[0];
        _piece2 = pieces[1];
        _piece3 = pieces[2];
        _piece4 = pieces[3];
    }

    public Sha256_Long(byte[] bytes)
    {
        if (bytes.Length != 32)
        {
            throw new ArgumentException("Sha256 values must be exactly 32 bytes.");
        }
        _piece1 = BitConverter.ToUInt64(bytes, 0);
        _piece2 = BitConverter.ToUInt64(bytes, 8);
        _piece3 = BitConverter.ToUInt64(bytes, 16);
        _piece4 = BitConverter.ToUInt64(bytes, 24);
    }

    public override string ToString()
    {
        return String.Format("{0:X16}{1:X16}{2:X16}{3:X16}", _piece1, _piece2, _piece3, _piece4);
    }

    public int CompareTo(Sha256_Long other)
    {
        if (this._piece1 < other._piece1) return -1;
        if (this._piece1 > other._piece1) return 1;
        if (this._piece2 < other._piece2) return -1;
        if (this._piece2 > other._piece2) return 1;
        if (this._piece3 < other._piece3) return -1;
        if (this._piece3 > other._piece3) return 1;
        if (this._piece4 < other._piece4) return -1;
        if (this._piece4 > other._piece4) return 1;
        return 0;
    }

    //-------------------------------------------------------------------
    // Implementation of key extraction

    public const int KeyBits = 8;
    private static UInt64 _keyMask;
    private static int _shiftBits;

    static Sha256_Long()
    {
        _keyMask = 0;
        for (int i = 0; i < KeyBits; i++)
        {
            _keyMask |= (UInt64)1 << i;
        }
        _shiftBits = 64 - KeyBits;
    }

    public int ExtractKey()
    {
        // Use the top KeyBits bits of the first 64-bit piece as the bucket index.
        UInt64 keyRaw = _piece1 >> _shiftBits;
        return (int)(keyRaw & _keyMask);
    }
}

class IndexedSet<T> where T : IComparable<T>, IKeyed
{
    private T[][] _keyedSets;

    public IndexedSet(IEnumerable<T> source, int keyBits)
    {
        // Arrange elements into groups by key
        var keyedSetsInit = new Dictionary<int, List<T>>();
        foreach (T item in source)
        {
            int key = item.ExtractKey();
            List<T> vals;
            if (!keyedSetsInit.TryGetValue(key, out vals))
            {
                vals = new List<T>();
                keyedSetsInit.Add(key, vals);
            }
            vals.Add(item);
        }

        // Transform the above structure into a more efficient array-based structure
        int nKeys = 1 << keyBits;
        _keyedSets = new T[nKeys][];
        for (int key = 0; key < nKeys; key++)
        {
            List<T> vals;
            if (keyedSetsInit.TryGetValue(key, out vals))
            {
                _keyedSets[key] = vals.OrderBy(x => x).ToArray();
            }
        }
    }

    public bool Contains(T item)
    {
        int key = item.ExtractKey();
        if (_keyedSets[key] == null)
        {
            return false;
        }
        else
        {
            return Search(item, _keyedSets[key]);
        }
    }

    private bool Search(T item, T[] set)
    {
        int first = 0;
        int last = set.Length - 1;

        while (first <= last)
        {
            int midpoint = (first + last) / 2;
            int cmp = item.CompareTo(set[midpoint]);
            if (cmp == 0)
            {
                return true;
            }
            else if (cmp < 0)
            {
                last = midpoint - 1;
            }
            else
            {
                first = midpoint + 1;
            }
        }
        return false;
    }
}

class Program
{
    //private const int NTestItems = 100 * 1000 * 1000;
    private const int NTestItems = 1 * 1000 * 1000;

    private static Sha256_Long RandomHash(Random rand)
    {
        var bytes = new byte[32];
        rand.NextBytes(bytes);
        return new Sha256_Long(bytes);
    }

    static IEnumerable<Sha256_Long> GenerateRandomHashes(
        Random rand, int nToGenerate)
    {
        for (int i = 0; i < nToGenerate; i++)
        {
            yield return RandomHash(rand);
        }
    }

    static void Main(string[] args)
    {
        Console.WriteLine("Generating test set.");

        var rand = new Random();

        IndexedSet<Sha256_Long> set =
            new IndexedSet<Sha256_Long>(
                GenerateRandomHashes(rand, NTestItems),
                Sha256_Long.KeyBits);

        Console.WriteLine("Testing with random input.");

        int nFound = 0;
        int nItems = NTestItems;
        int waypointDistance = 100000;
        int waypoint = 0;
        for (int i = 0; i < nItems; i++)
        {
            if (++waypoint == waypointDistance)
            {
                Console.WriteLine("Test lookups complete: " + (i + 1));
                waypoint = 0;
            }
            var item = RandomHash(rand);
            nFound += set.Contains(item) ? 1 : 0;
        }

        Console.WriteLine("Testing complete.");
        Console.WriteLine(String.Format("Found: {0} / {1}", nFound, nItems));
        Console.ReadKey();
    }
}
Nate C-K