What would be an efficient way to find the averaged size distribution without having to declare a gigantic 2D array?

Question

Consider a L x L size matrix M, whose entries can be either 0 or 1. Each element is 1 with probability p and 0 with probability 1 - p. I will call the elements labelled 1 as black elements and elements labelled 0 as white elements. I'm trying to write a code which:

Generates a random matrix with entries 0's and 1's. I need to input size of matrix L and the probability p.
Labels all the black elements belonging to the same cluster with the same number. (Define a cluster of black element as a maximal connected component in the graph of cells with the value of 1, where edges connect cells whose rows and columns both differ by at most 1 (so up to eight neighbours for each cell). In other words if two black elements of the matrix share an edge or a vertex consider them as belonging to the same black cluster. That is, think of the matrix as a large square and the elements as small squares in the large square.)

Within a loop that runs from p = 0 to p = 100 (I'm referring to probability p as a percentage): the number of black clusters of each size is written onto a file corresponding to that value of p.

For example:

Input: p = 30, L = 50

Output (which is written to a data file for each p; thus there are 101 data files created by this program, from p = 0 to p = 100):

1 100 (i.e. there are 100 black clusters of size 1)

2 20 (i.e. there are 20 black clusters of size 2)

3 15 (i.e. there are 15 black clusters of size 3)

and so on...

#include "stdafx.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <string.h>
int *labels;
int  n_labels = 0;

int uf_find(int x) {
    int y = x;
    while (labels[y] != y)
        y = labels[y];

    while (labels[x] != x) {
        int z = labels[x];
        labels[x] = y;
        x = z;
    }
    return y;
}

int uf_union(int x, int y) {
    return labels[uf_find(x)] = uf_find(y);
}

int uf_make_set(void) {
    labels[0] ++;
    assert(labels[0] < n_labels);
    labels[labels[0]] = labels[0];
    return labels[0];
}

void uf_initialize(int max_labels) {
    n_labels = max_labels;
    labels = (int*)calloc(sizeof(int), n_labels);
    labels[0] = 0;
}

void uf_done(void) {
    free(labels);
    labels = 0;
}

#define max(a,b) (a>b?a:b)
#define min(a,b) (a>b?b:a)

int hoshen_kopelman(int **matrix, int m, int n) {

    uf_initialize(m * n / 2);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {

                int up = (i == 0 ? 0 : matrix[i - 1][j]);
                int left = (j == 0 ? 0 : matrix[i][j - 1]);

                switch (!!up + !!left) {

                case 0:
                    matrix[i][j] = uf_make_set();
                    break;

                case 1:
                    matrix[i][j] = max(up, left);
                    break;

                case 2:
                    matrix[i][j] = uf_union(up, left);
                    break;
                }
                int north_west;
                if (i == 0 || j == 0)
                    north_west = 0;
                else
                    north_west = matrix[i - 1][j - 1];

                int north_east;
                if (i == 0 || j == (n - 1))
                    north_east = 0;
                else
                    north_east = matrix[i - 1][j + 1];

                if (!!north_west == 1)
                {
                    if (i != 0 && j != 0)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i][j - 1] == 0 && matrix[i - 1][j] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_west;
                                else
                                    uf_union(north_west, matrix[i][j]);
                            }
                        //}

                    }
                    else if (i == 0 || j == 0)
                    {

                    }

                }
                if (!!north_east == 1)
                {
                    if (i != 0 && j != n - 1)
                    {
                        //if (rand() % 2 == 1)
                        //{
                            if (matrix[i - 1][j] == 0 && matrix[i][j + 1] == 0)
                            {
                                if (!!matrix[i][j] == 0)
                                    matrix[i][j] = north_east;
                                else
                                    uf_union(north_east, matrix[i][j]);
                            }
                        //}
                    }
                    else if (i == 0 || j == n - 1)
                    {

                    }
                }
            }
    int *new_labels = (int*)calloc(sizeof(int), n_labels);

    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                int x = uf_find(matrix[i][j]);
                if (new_labels[x] == 0) {
                    new_labels[0]++;
                    new_labels[x] = new_labels[0];
                }
                matrix[i][j] = new_labels[x];
            }

    int total_clusters = new_labels[0];

    free(new_labels);
    uf_done();

    return total_clusters;
}

void check_labelling(int **matrix, int m, int n) {
    int N, S, E, W;
    for (int i = 0; i<m; i++)
        for (int j = 0; j<n; j++)
            if (matrix[i][j]) {
                N = (i == 0 ? 0 : matrix[i - 1][j]);
                S = (i == m - 1 ? 0 : matrix[i + 1][j]);
                E = (j == n - 1 ? 0 : matrix[i][j + 1]);
                W = (j == 0 ? 0 : matrix[i][j - 1]);

                assert(N == 0 || matrix[i][j] == N);
                assert(S == 0 || matrix[i][j] == S);
                assert(E == 0 || matrix[i][j] == E);
                assert(W == 0 || matrix[i][j] == W);
            }
}

int cluster_count(int **matrix, int size, int N)
{
    int i;
    int j;
    int count = 0;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            if (matrix[i][j] == N)
                count++;
        }
    }
    return count;
}

int main(int argc, char **argv)
{
    srand((unsigned int)time(0));
    int p = 0;
    printf("Enter number of rows/columns: ");
    int size = 0;
    scanf("%d", &size);
    printf("\n");
    FILE *fp;

    printf("Enter number of averaging iterations: ");
    int iterations = 0;

    scanf("%d", &iterations);

        for (int p = 0; p <= 100; p++)
        { 
            char str[100];
            sprintf(str, "BlackSizeDistribution%03i.txt", p);
            fp = fopen(str, "w");
            int **matrix;
            matrix = (int**)calloc(10, sizeof(int*));
            int** matrix_new = (int**)calloc(10, sizeof(int*));
            matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
            matrix = matrix_new;
        for (int i = 0; i < size; i++)
        {
            matrix[i] = (int *)calloc(size, sizeof(int));
            for (int j = 0; j < size; j++)
            {
                int z = rand() % 100;
                z = z + 1;
                if (p == 0)
                    matrix[i][j] = 0;
                if (z <= p)
                    matrix[i][j] = 1;
                else
                    matrix[i][j] = 0;
            }
        }

        hoshen_kopelman(matrix, size, size);
        int highest = matrix[0][0];
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                if (highest < matrix[i][j])
                    highest = matrix[i][j];
        int* counter = (int*)calloc(sizeof(int*), highest + 1);
        int high = 0;
        for (int k = 1; k <= highest; k++)
        {
            counter[k] = cluster_count(matrix, size, k);
            if (counter[k] > high)
                high = counter[k];
        }
        int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

        for (int y = 1; y <= high; y++)
        {
           int count = 0;
           for (int z = 1; z <= highest; z++)
               if (counter[z] == y)
                   count++;
           size_distribution[y] = count;
        }

        for (int k = 1; k <= high; k++)
        {
            fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
            printf("\n%d\t%d", k, size_distribution[k]);
        }
        printf("\n");
        check_labelling(matrix, size, size);
        for (int i = 0; i < size; i++)
            free(matrix[i]);
        free(matrix);
        }
}

I used the data files outputted by this program to create bar graphs for each p ranging from 0 to 100, using Gnuplot.

Some sample graphs:

This one corresponds to input p = 3 and L = 100

This one corresponds to p = 90 and L = 100

Okay, so I suppose that was enough context for the actual question I had.

The program I wrote only outputs value for one 1 iteration per value of p. Since this program is for a scientific computing purpose I need more accurate values and for that I need to output "averaged" size distribution value; over say 50 or 100 iterations. I'm not exactly sure how to do the averaging.

To be more clear this is what I want:

Say the output for three different runs of the program gives me (for say p = 3, L = 100)

Size    Iteration 1    Iteration 2    Iteration 3  Averaged Value (Over 3 iterations)

1       150            170             180         167

2       10             20              15          18

3       1              2               1           1

4       0              0               1           0  
.
.
.

I want to do something similar, except that I would need to perform the averaging over 50 or 100 iterations to get accurate values for my mathematical model. By the way, I don't really need to output the values for each iteration i.e. Iteration 1, Iteration 2, Iteration 3, ... in the data file. All I need to "print" is the first column and the last column (which has the averaged values) in the above table.

Now, I would probably need to modify the main function for that to do the averaging.

Here's the main function:

int main(int argc, char **argv)
    {
        srand((unsigned int)time(0));
        int p = 0;
        printf("Enter number of rows/columns: ");
        int size = 0;
        scanf("%d", &size);
        printf("\n");
        FILE *fp;

        printf("Enter number of averaging iterations: ");
        int iterations = 0;

        scanf("%d", &iterations);

            for (int p = 0; p <= 100; p++)
            { 
                char str[100];
                sprintf(str, "BlackSizeDistribution%03i.txt", p);
                fp = fopen(str, "w");
                int **matrix;
                matrix = (int**)calloc(10, sizeof(int*));
                int** matrix_new = (int**)calloc(10, sizeof(int*));
                matrix_new = (int **)realloc(matrix, sizeof(int*) * size);
                matrix = matrix_new;
            for (int i = 0; i < size; i++)
            {
                matrix[i] = (int *)calloc(size, sizeof(int));
                for (int j = 0; j < size; j++)
                {
                    int z = rand() % 100;
                    z = z + 1;
                    if (p == 0)
                        matrix[i][j] = 0;
                    if (z <= p)
                        matrix[i][j] = 1;
                    else
                        matrix[i][j] = 0;
                }
            }

            hoshen_kopelman(matrix, size, size);
            int highest = matrix[0][0];
            for (int i = 0; i < size; i++)
                for (int j = 0; j < size; j++)
                    if (highest < matrix[i][j])
                        highest = matrix[i][j];
            int* counter = (int*)calloc(sizeof(int*), highest + 1);
            int high = 0;
            for (int k = 1; k <= highest; k++)
            {
                counter[k] = cluster_count(matrix, size, k);
                if (counter[k] > high)
                    high = counter[k];
            }
            int* size_distribution = (int*)calloc(sizeof(int*), high + 1);

            for (int y = 1; y <= high; y++)
            {
               int count = 0;
               for (int z = 1; z <= highest; z++)
                   if (counter[z] == y)
                       count++;
               size_distribution[y] = count;
            }

            for (int k = 1; k <= high; k++)
            {
                fprintf(fp, "\n%d\t%d", k, size_distribution[k]);
                printf("\n%d\t%d", k, size_distribution[k]);
            }
            printf("\n");
            check_labelling(matrix, size, size);
            for (int i = 0; i < size; i++)
                free(matrix[i]);
            free(matrix);
            }
    }

One of the ways I thought of was declaring a 2D array of size (largest size of black cluster possible for that) x (number of averaging iterations I want), for each run of the p-loop, and at the end of the program extract the average size distribution for each p from the 2D array. The largest size of black cluster possible in a matrix of size 100 x 100 (say) is 10,000. But for say smaller values of p (say p = 1, p = 20, ...) such large size clusters are never even produced! So creating such a large 2D array in the beginning would be a terrible wastage of memory space and it would take days to execute! (Please keep in mind that I need to run this program for L = 1000 and L = 5000 and if possible even L = 10000 for my purpose)

There must be some other more efficient way to do this. But I don't know what. Any help is appreciated.

While computing the averages for one value of p, you need only the current n×n array being evaluated and one array of n*n elements to accumulate totals of component sizes, plus the workspace needed to compute the component sizes. There is no need for any array of three dimensions or more. You will want an efficient way to count clusters, but I suspect it can be done in one pass over the array (O(n×n) time). — Eric Postpischil, Apr 28 '18 at 10:40
@EricPostpischil Well, I know don't the maximum size of black cluster that will be formed for each iteration given a certain p, beforehand. So, how would I declare the size of the 2D array you're speaking of, initially ? If you have any neat idea in mind please consider writing an answer — , Apr 28 '18 at 10:49
@EricPostpischil Eh. That would be a terrible wastage of memory space for smaller probabilities p and also increase run-time. Say for p = 3 and n = 100, the largest cluster size that would be formed would be way smaller than 100*100. — , Apr 28 '18 at 10:55
@EricPostpischil I need to run the program for n = 1000. So that's not very helpful. It would severely affect the computation time. It's already taking over an hour to run in its current form. — , Apr 28 '18 at 10:58
Maybe I miss something, but won't a simple `map` do the job? Indexed by the cluster size, you store how may times that cluster size occurred. — geza, Apr 28 '18 at 11:17
@geza "indexed by cluster size" The thing is I don't know the maximum size of black cluster size that can occur for a certain p, beforehand. So can't declare size of map beforehand. — , Apr 28 '18 at 11:34
@Blue: you don't need to know, or am I missing something? Just add it into the map. The key is the cluster size, the value is the count. Just like, if you create a map with string keys, you don't know all the strings beforehand (similarly, you don't know all the possible cluster sizes beforehand, but it is not necessary). — geza, Apr 28 '18 at 12:07
@Blue, yeah, but you can find a map (hashtable) implementation for C too, I bet there's a lots of them — geza, Apr 28 '18 at 12:10
The cost of resetting an n*n array of counts for possible sizes is O(n*n), the same as the order of the cost of creating the n⨉n array of random elements (and with a lower constant, since no random number generator is used). This is not a large part of the compute time. — Eric Postpischil, Apr 28 '18 at 12:20
@Blue: or just go the way Eric recommends. for L=10000, a 10000x10000 32-bit int array is just 381.5MB, today's computers have a lot more memory. — geza, Apr 28 '18 at 12:24
@Blue: Using my approach, on an Intel i5-7200U laptop, running N=1000 iterations over L=1000 with p=25% takes less than 17 seconds. As to the results, there were 144376666 clusters of one, 144376666 of two, 8771364 of three, 2773646 of four, 912575 of five, 322752 of six, 116640 of seven, 43002 of eight, 16015 of nine, 5903 of ten, 2311 of eleven, 836 of twelve, 336 of thirteen, 145 of fourteen, 42 of fifteen, 16 of sixteen, 10 of seventeen, 2 of eighteen, and 1 twenty-one cell cluster. It is very clearly a negative exponential distribution. — Nominal Animal, Apr 28 '18 at 13:02
@NominalAnimal Strange. It took me considerably longer iirc. Did you reuse my code? May I see what addition you made to include the 1000 iterations ? You could paste it here: https://paste.ofcode.org — , Apr 28 '18 at 13:08
@Blue: No, of course not; I wrote mine from scratch, adapting my pseudocode to C. For N=1000, L=1000 (1000×1000 matrix), p=50%, the wall clock runtime does increase to 28 seconds or so (as the disjoint set operations take more time), and the distribution gets much more interesting, with a curve along the small sizes. The largest cluster contained 80 cells; there were single occurrences of clusters with 67, 68, 69, 71, 73, 74, 75, and 80 cells. — Nominal Animal, Apr 28 '18 at 13:09
@NominalAnimal Hmm, well I don't see how your code could be that much faster than mine considering that we used almost the same logic. As far the distribution you're talking about, I did get something similar (but I took only 1 iteration). Do you mind showing me your code btw? — , Apr 28 '18 at 13:14
@Blue: Added to my answer (and it is public domain), but do note I wrote it very quick, and it has not been verified for correct operation; not something I'd trust myself for scientific publication. The logic is sound, and I do stand behind it (so it's just that the code might have bugs or typos in it, and it's pretty ugly, being written quick in one sitting). — Nominal Animal, Apr 28 '18 at 13:19
@NominalAnimal Thanks. That's okay. I'm checking it. Will let you know if I spot something. You can always edit it. :) — , Apr 28 '18 at 13:21
@Blue: You tagged the question with C, but Visual Studio does not include a C compiler. It has a somewhat C++ compiler. Those errors are due to C++’s stricter type system. You can quiet them by inserting explicit reinterpret_cast operators (although writing proper C++ code would be preferable). — Eric Postpischil, Apr 28 '18 at 13:45
Since L is typicall less than 128, you can use a single SIMD (__m128i for intel) and store the values as bits. Then you can do most calculations a row at a time. For > 128, just use more — technosaurus, Apr 28 '18 at 14:29

Nominal Animal · Accepted Answer · 2018-04-29T15:01:37.000

Ah, a topic very close to my own heart! :)

Because you are collecting statistics, and the exact configuration is only interesting for the duration of collecting said statistics, you only need two L×L matrixes: one to hold the color information (one white, 0 to L×L blacks, so a type that can hold an integer between 0 and L²+1, inclusive); and the other to hold the number of elements of that type (an integer between 0 and L², inclusive).

To hold the number of black clusters of each size for each L and p value, across many iterations, you'll need a third array of L×L+1 elements; but do note that its values can reach N×L×L, if you do N iterations using the same L and p.

The standard C pseudorandom number generator is horrible; do not use that. Use either Mersenne Twister or my favourite, Xorshift64* (or the related Xorshift1024*). While Xorshift64* is extremely fast, its distribution is sufficiently "random" for simulations such as yours.

Instead of storing just "black" or "white", store either "uninteresting", or a cluster ID; initially, all (unconnected) blacks have an unique cluster ID. This way, you can implement a disjoint set and join the connected cells to clusters very efficiently.

Here is how I would implement this, in pseudocode:

Let id[L*L]        be the color information; index is L*row+col
Let idcount[L*L+1] be the id count over one iteration
Let count[L*L+1]   be the number of clusters across N iterations

Let limit = (p / 100.0) * PRNG_MAX

Let white = L*L    (and blacks are 0 to L*L-1, inclusive)
Clear count[] to all zeros

For iter = 1 to N, inclusive:

    For row = 0 to L-1, inclusive:

        If PRNG() <= limit:
            Let id[L*row] = L*row
        Else:
            Let id[L*row] = white
        End If

        For col = 1 to L-1, inclusive:
            If PRNG() <= limit:
                If id[L*row+col-1] == white:
                    Let id[L*row+col] = L*row + col
                Else:
                    Let id[L*row+col] = id[L*row+col-1]
                End If
            Else:
                Let id[L*row+col] = white
            End If
        End For

    End For

Note that I do the horizontal join pass when generating the cell ID's, simply by reusing the same ID for consecutive black cells. At this point, id[][] is now filled with black cells of various IDs at a probability of p percent, if PRNG() returns an unsigned integer between 0 and PRNG_MAX, inclusive.

Next, you do a pass of joins. Because horizontally connected cells are already joined, you need to do vertical, and the two diagonals. Do note that you should use path flattening when joining, for efficiency.

    For row = 0 to L-2, inclusive:
        For col = 0 to L-1, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col] != white):
                Join id[L*row+col] and id[L*row+L+col]
            End If
        End For
    End For

    For row = 0 to L-2, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row+L+col+1] != white):
                Join id[L*row+col] and id[L*row+L+col+1]
            End If
        End For
    End For

    For row = 1 to L-1, inclusive:
        For col = 0 to L-2, inclusive:
            If (id[L*row+col] != white) And (id[L*row-L+col+1] != white):
                Join id[L*row+col] and id[L*row-L+col+1]
            End If
        End For
    End For

Of course, you should combine loops, to maintain best possible locality of access. (Do col == 0 separately, and row == 0 in a separate inner loop, minimizing if clauses, as those tend to be slow.) You could even make the id array (L+1)² in size, filling the outer edge of cells with white, to simplify the above joins into a single double loop, at a cost of 4L+4 extra cells used.

At this point, you need to flatten each disjoint set. Each ID value that is not white is either L*row+col (meaning "this"), or a reference to another cell. Flattening means that for each ID, we find the final ID in the chain, and use that instead:

    For i = 0 to L*L-1, inclusive:
        If (id[i] != white):
            Let k = i
            While (id[k] != k):
                k = id[k]
            End While
            id[i] = k
        End If
    End For

Next, we need to count the number of cells that have a specific ID:

    Clear idn[]

    For i = 0 to L*L-1, inclusive:
        Increment idn[id[i]]
    End For

Because we are interested in the number of clusters of specific size over N iterations, we can simply update the count array by each cluster of specific size found in this iteration:

    For i = 1 to L*L, inclusive:
        Increment count[idn[i]]
    End For
    Let count[0] = 0

End For

At this point count[i] contains the number of black clusters of i cells found over N iterations; in other words, it is the histogram (discrete distribution) of the cluster sizes seen during the N iterations.

One implementation of the above could be as follows.

(The very first implementation had issues with the array sizes and cluster labels; this version splits that part into a separate file, and is easier to verify for correctness. Still, this is just the second version, so I do not consider it production-quality.)

First, the functions manipulating the matrix, cluster.h:

#ifndef   CLUSTER_H
#define   CLUSTER_H
#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include <time.h>

/*
   This is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

typedef  uint32_t  cluster_label;
typedef  uint32_t  cluster_size;

static size_t  rows = 0;    /* Number of mutable rows */
static size_t  cols = 0;    /* Number of mutable columns */
static size_t  nrows = 0;   /* Number of accessible rows == rows + 1 */
static size_t  ncols = 0;   /* Number of accessible columns == cols + 1 */
static size_t  cells = 0;   /* Total number of cells == nrows*ncols */
static size_t  empty = 0;   /* Highest-numbered label == nrows*ncols - 1 */
static size_t  sizes = 0;   /* Number of possible cluster sizes == rows*cols + 1 */
#define  INDEX(row, col)  (ncols*(row) + (col))

cluster_label   *label  = NULL;  /* 2D contents: label[ncols*(row) + (col)] */
cluster_size    *occurs = NULL;  /* Number of occurrences of each label value */

/* Xorshift64* PRNG used for generating the matrix.
*/
static uint64_t  prng_state = 1;

static inline uint64_t  prng_u64(void)
{
    uint64_t  state = prng_state;
    state ^= state >> 12;
    state ^= state << 25;
    state ^= state >> 27;
    prng_state = state;
    return state * UINT64_C(2685821657736338717);
}

static inline uint64_t  prng_u64_less(void)
{
    uint64_t  result;
    do {
        result = prng_u64();
    } while (result == UINT64_C(18446744073709551615));
    return result;
}

static inline uint64_t  prng_seed(const uint64_t  seed)
{
    if (seed)
        prng_state = seed;
    else
        prng_state = 1;

    return prng_state;
}

static inline uint64_t  prng_randomize(void)
{
    int       discard = 1024;
    uint64_t  seed;
    FILE     *in;

    /* By default, use time. */
    prng_seed( ((uint64_t)time(NULL) * 2832631) ^
               ((uint64_t)clock() * 1120939) );

#ifdef __linux__
    /* On Linux, read from /dev/urandom. */
    in = fopen("/dev/urandom", "r");
    if (in) {
        if (fread(&seed, sizeof seed, 1, in) == 1)
            prng_seed(seed);
        fclose(in);        
    }
#endif

    /* Churn the state a bit. */
    seed = prng_state;
    while (discard-->0) {
        seed ^= seed >> 12;
        seed ^= seed << 25;
        seed ^= seed >> 27;
    }
    return prng_state = seed;
}




static inline void cluster_free(void)
{
    free(occurs); occurs = NULL;
    free(label);  label = NULL;
    rows = 0;
    cols = 0;
    nrows = 0;
    ncols = 0;
    cells = 0;
    empty = 0;
    sizes = 0;
}


static int cluster_init(const size_t want_cols, const size_t want_rows)
{
    cluster_free();

    if (want_cols < 1 || want_rows < 1)
        return -1; /* Invalid number of rows or columns */

    rows = want_rows;
    cols = want_cols;

    nrows = rows + 1;
    ncols = cols + 1;

    cells = nrows * ncols;

    if ((size_t)(cells / nrows) != ncols ||
        nrows <= rows || ncols <= cols)
        return -1; /* Size overflows */

    empty = nrows * ncols - 1;

    sizes = rows * cols + 1;

    label  = calloc(cells, sizeof label[0]);
    occurs = calloc(cells, sizeof occurs[0]);
    if (!label || !occurs) {
        cluster_free();
        return -1; /* Not enough memory. */
    }

    return 0;
}


static void join2(size_t i1, size_t i2)
{
    size_t  root1 = i1, root2 = i2, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    root = root1;
    if (root > root2)
        root = root2;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }
}

static void join3(size_t i1, size_t i2, size_t i3)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }
}

static void join4(size_t i1, size_t i2, size_t i3, size_t i4)
{
    size_t  root1 = i1, root2 = i2, root3 = i3, root4 = i4, root;

    while (root1 != label[root1])
        root1 = label[root1];

    while (root2 != label[root2])
        root2 = label[root2];

    while (root3 != label[root3])
        root3 = label[root3];

    while (root4 != label[root4])
        root4 = label[root4];

    root = root1;
    if (root > root2)
        root = root2;
    if (root > root3)
        root = root3;
    if (root > root4)
        root = root4;

    while (label[i1] != root) {
        const size_t  temp = label[i1];
        label[i1] = root;
        i1 = temp;
    }

    while (label[i2] != root) {
        const size_t  temp = label[i2];
        label[i2] = root;
        i2 = temp;
    }

    while (label[i3] != root) {
        const size_t  temp = label[i3];
        label[i3] = root;
        i3 = temp;
    }

    while (label[i4] != root) {
        const size_t  temp = label[i4];
        label[i4] = root;
        i4 = temp;
    }
}

static void cluster_fill(const uint64_t plimit)
{
    size_t  r, c;

    /* Fill the matrix with the specified probability.
       For efficiency, we use the same label for all
       horizontally consecutive cells.
    */
    for (r = 0; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        cluster_label last;
        size_t        i = ncols*r;

        last = (prng_u64_less() < plimit) ? i : empty;
        label[i] = last;

        while (++i < imax) {
            if (prng_u64_less() < plimit) {
                if (last == empty)
                    last = i;
            } else
                last = empty;

            label[i] = last;
        }

        label[imax] = empty;
    }

    /* Fill the extra row with empty, too. */
    {
        cluster_label       *ptr = label + ncols*rows;
        cluster_label *const end = label + ncols*nrows;
        while (ptr < end)
            *(ptr++) = empty;
    }

    /* On the very first row, we need to join non-empty cells
       vertically and diagonally down. */
    for (c = 0; c < cols; c++)
        switch ( ((label[c]         < empty) << 0) |
                 ((label[c+ncols]   < empty) << 1) |
                 ((label[c+ncols+1] < empty) << 2) ) {
            /*  <1>    *
             *   2  4  */
        case 3: /* Join down */
            join2(c, c+ncols); break;
        case 5: /* Join down right */
            join2(c, c+ncols+1); break;
        case 7: /* Join down and down right */
            join3(c, c+ncols, c+ncols+1); break;
        }

    /* On the other rows, we need to join non-empty cells
       vertically, diagonally down, and diagonally up. */
    for (r = 1; r < rows; r++) {
        const size_t  imax = ncols*r + cols;
        size_t        i;
        for (i = ncols*r; i < imax; i++)
            switch ( ((label[i]         < empty) << 0) |
                     ((label[i-ncols+1] < empty) << 1) |
                     ((label[i+ncols]   < empty) << 2) |
                     ((label[i+ncols+1] < empty) << 3) ) {
            /*      2  *
             *  <1>    *
             *   4  8  */
            case 3: /* Diagonally up */
                join2(i, i-ncols+1); break;
            case 5: /* Down */
                join2(i, i+ncols); break;
            case 7: /* Down and diagonally up */
                join3(i, i-ncols+1, i+ncols); break;
            case 9: /* Diagonally down */
                join2(i, i+ncols+1); break;
            case 11: /* Diagonally up and diagonally down */
                join3(i, i-ncols+1, i+ncols+1); break;
            case 13: /* Down and diagonally down */
                join3(i, i+ncols, i+ncols+1); break;
            case 15: /* Down, diagonally down, and diagonally up */
                join4(i, i-ncols+1, i+ncols, i+ncols+1); break;
            }
    }

    /* Flatten the labels, so that all cells belonging to the
       same cluster will have the same label. */
    {
        const size_t  imax = ncols*rows;
        size_t        i;
        for (i = 0; i < imax; i++)
            if (label[i] < empty) {
                size_t  k = i, kroot = i;

                while (kroot != label[kroot])
                    kroot = label[kroot];

                while (label[k] != kroot) {
                    const size_t  temp = label[k];
                    label[k] = kroot;
                    k = temp;
                }
            }
    }

    /* Count the number of occurrences of each label. */
    memset(occurs, 0, (empty + 1) * sizeof occurs[0]);
    {
        cluster_label *const end = label + ncols*rows;
        cluster_label       *ptr = label;

        while (ptr < end)
            ++occurs[*(ptr++)];
    }
}

#endif /* CLUSTER_H */

Note that to get full probability range, the prng_u64_less() function never returns the highest possible uint64_t. It is way overkill, precision-wise, though.

The main.c parses the input parameters, iterates, and prints the results:

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t      *count = NULL;
    uint64_t       plimit;
    unsigned long  size, iterations, i;
    double         probability;
    char           dummy;

    if (argc != 4) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE ITERATIONS PROBABILITY\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       ITERATIONS   is the number of iterations, and\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lu %c", &iterations, &dummy) != 1 || iterations < 1u) {
        fprintf(stderr, "%s: Invalid number of iterations.\n", argv[2]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[3], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[3]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the cluster size histogram. */
    count = calloc(sizes + 1, sizeof count[0]);
    if (!count) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    printf("# Xorshift64* seed is %" PRIu64 "\n", prng_randomize());
    printf("# Matrix size is %lu x %lu\n", size, size);
    printf("# Probability of a cell to be non-empty is %.6f%%\n",
           100.0 * ((double)plimit / 18446744073709551615.0));
    printf("# Collecting statistics over %lu iterations\n", iterations);
    printf("# Size Count CountPerIteration\n");
    fflush(stdout);

    for (i = 0u; i < iterations; i++) {
        size_t  k = empty;

        cluster_fill(plimit);

        /* Add cluster sizes to the histogram. */
        while (k-->0)
            ++count[occurs[k]];
    }

    /* Print the histogram of cluster sizes. */
    {
        size_t k = 1;

        printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);

        for (k = 2; k < sizes; k++)
            if (count[k-1] != 0 || count[k] != 0 || count[k+1] != 0)
                printf("%zu %" PRIu64 " %.3f\n", k, count[k], (double)count[k] / (double)iterations);
    }

    return EXIT_SUCCESS;
}

The program outputs the cluster size distribution (histogram) in Gnuplot-compatible format, beginning with a few comment lines starting with # to indicate the exact parameters used to compute the results.

The first column in the output is the cluster size (number of cells in a cluster), the second column is the exact count of such clusters found during the iterations, and the third column is the observed probability of occurrence (i.e., total count of occurrences divided by the number of iterations).

Here is what the cluster size distribution for L=1000, N=1000 looks like for a few different values of p: Note that both axes have logarithmic scaling, so it is a log-log plot. It was generated by running the above program a few times, once for each unique p, and saving the output to a file (name containing the value of p), then printing them into a single plot in Gnuplot.

For verification of the clustering functions, I wrote a test program that generates one matrix, assigns random colors to each cluster, and outputs it as an PPM image (that you can manipulate with NetPBM tools, or basically any image manipulation program); ppm.c:

#include <stdlib.h>
#include <inttypes.h>
#include <stdio.h>
#include "cluster.h"

/*
   This program is in the public domain.
   Written by Nominal Animal <question@nominal-animal.net>
*/

int main(int argc, char *argv[])
{
    uint64_t       plimit;
    uint32_t      *color;
    unsigned long  size, r, c;
    double         probability;
    char           dummy;

    if (argc != 3) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s SIZE PROBABILITY > matrix.ppm\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "Where  SIZE         sets the matrix size (SIZE x SIZE),\n");
        fprintf(stderr, "       PROBABILITY  is the probability [0, 1] of a cell\n");
        fprintf(stderr, "                    being non-empty.\n");
        fprintf(stderr, "\n");
        return EXIT_FAILURE;
    }

    if (sscanf(argv[1], " %lu %c", &size, &dummy) != 1 || size < 1u) {
        fprintf(stderr, "%s: Invalid matrix size.\n", argv[1]);
        return EXIT_FAILURE;
    }

    if (sscanf(argv[2], " %lf %c", &probability, &dummy) != 1 || probability < 0.0 || probability > 1.0) {
        fprintf(stderr, "%s: Invalid probability.\n", argv[2]);
        return EXIT_FAILURE;
    }

    /* Technically, we want plimit = (uint64_t)(0.5 + 18446744073709551615.0*p),
       but doubles do not have the precision for that. */
    if (probability > 0.9999999999999999)
        plimit = UINT64_C(18446744073709551615);
    else
    if (probability <= 0)
        plimit = UINT64_C(0);
    else
        plimit = (uint64_t)(18446744073709551616.0 * probability);

    /* Try allocating memory for the matrix and the label count array. */
    if (size > 2147483646u || cluster_init(size, size)) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    /* Try allocating memory for the color lookup array. */
    color = calloc(empty + 1, sizeof color[0]);
    if (!color) {
        fprintf(stderr, "%s: Matrix size is too large.\n", argv[1]);
        return EXIT_FAILURE;
    }

    fprintf(stderr, "Using Xorshift64* seed %" PRIu64 "\n", prng_randomize());
    fflush(stderr);

    /* Construct the matrix. */
    cluster_fill(plimit);

    {
        size_t  i;

        color[empty] = 0xFFFFFFu;

        /* Assign random colors. */
        for (i = 0; i < empty; i++)
            color[i] = prng_u64() >> 40;
    }

    printf("P6\n%lu %lu 255\n", size, size);
    for (r = 0; r < rows; r++)
        for (c = 0; c < cols; c++) {
            const uint32_t  rgb = color[label[ncols*r + c]];
            fputc((rgb >> 16) & 255, stdout);
            fputc((rgb >> 8)  & 255, stdout);
            fputc( rgb        & 255, stdout);
        }
    fflush(stdout);

    fprintf(stderr, "Done.\n");

    return EXIT_SUCCESS;
}

Here is what a 256x256 matrix at p=40% looks like: 256x256 matrix, p=40%

Note that the white background is not counted as a cluster.

Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/169996/discussion-between-blue-and-nominal-animal). — , Apr 28 '18 at 16:16
@Blue: I wrote the example program, and this version should work much better (and be easier to verify for correct operation). — Nominal Animal, Apr 29 '18 at 14:34
Sorry for disturbing after so long, but I was trying to implement something more to the program you wrote. Your program only calculates the number of clusters of the black pixels of different sizes (where each pixel has a probability p of being black). I was trying to change the `cluster.h` file a bit to even calculate the number of clusters of different sizes of the remaining white pixels. Any idea what would be the most efficient way to do it? Is it possible to do it alongside calculating the size histogram for the black pixels? — , Jun 16 '18 at 05:33
@Blue: I sent you a preliminary implementation via email, but I am still letting my unconscious brew on that. The algorithm itself is clear (a disjoint set, where the elements are joined by desired probability), but an efficient implementation is an interesting problem; especially since one wants separate histograms of black and white clusters. (That means that one needs to attach the color information to the roots of the disjoint set). I suspect that a separate color matrix, with disjoint set ID's being matrix cell indexes, might be more performant and easier to verify. — Nominal Animal, Jun 19 '18 at 22:32
Hey, thanks. I received the program, but was having trouble running it. What are the command line arguments (for distribution.c) I need to give to run it? — , Jun 20 '18 at 06:44
For example, `./distribution L=100 p=0.3 d1=0.6 n=100000` for a 100x100 matrix with 30% nonzero cells, 60% probability that nonzero cells will connect diagonally, collecting statistics over 100000 matrices. Each line in the output contains cluster size, number of nonzero clusters of that size, number of zero clusters of that size, and total number of clusters of that size (so fourth is sum of second and third). Lines that begin with # are comments. In Gnuplot, try `plot "output" u 1:2 t "1" w lines, "" u 1:3 t "0" w lines, "" u 1:4 t "0+1" w lines lc -1`. — Nominal Animal, Jun 20 '18 at 14:03
The program is causing some weird errors. For the same input L=100 p=0.5 d0=0.3 d1=0.7 it sometimes shows the output and sometimes it doesn't. For L=1000 p=0.5 d0=0.3 d1=0.7 it doesn't show the output at all. Any idea what's happening? I'm using Visual Studio. Is the "fflush" causing any problem? — , Jun 21 '18 at 19:18
I started a Q&A thread about the problem, [here](https://stackoverflow.com/questions/50976732/different-behaviors-during-c-program-execution-for-same-set-of-inputs-command-l?) — , Jun 21 '18 at 20:24
@Blue: Hmm.. I do not use Windows myself, and only write C code for architectures that are sufficiently POSIXy; that is likely the reason. As you know, Windows and Microsoft products are not exactly known for their support of standards... and as I explained earlier, that code was a very quick mash-up I did in a single evening, definitely not reliable or robust at all. I'll see if I can implement the better approach in C89/C++ that should work with Visual Studio, too. (But do note that basically all HPC is done on non-Windows architectures; you might wish to switch OSes.) — Nominal Animal, Jun 22 '18 at 06:32
I think I spotted the bug: https://i.stack.imgur.com/DsNfY.gif. The value of the pointer djs is FD98D6B2 (see the error message). It's trying to read from an invalid memory address. — , Jun 22 '18 at 06:35
@Blue: Yup; I definitely wasn't happy with the [disjoint-set](https://en.wikipedia.org/wiki/Disjoint-set_data_structure) implementation that combines the color and index to a single identifier (`from` being the index of one there). I should have included range checking there, too. I'm not exactly sure where the invalid `from` originates, but it does not matter overmuch, since I intend to rewrite the algorithm anyway. — Nominal Animal, Jun 22 '18 at 06:37
Yeah, that's likely the problem, but I don't have an idea how to fix it myself. Since you wrote the code, it would be easier for you to spot the mistake (however, yes, I will try to decipher the code myself...but that's going to take considerable amount of time). If you manage to fix it, please let me know here, or via mail. Thanks though :) — , Jun 22 '18 at 06:40

score 0 · Answer 2 · answered Jun 22 '18 at 13:01

This is an addendum to my other answer, showing an example program that calculates both white (0) and black (1) clusters, separately. This was too long to fit in a single answer.

The idea is that we use a separate map for the color of each cell. This map also contains border cells with a color that does not match any color in the actual matrix. The disjoint set is a separate array, with one entry per matrix cell.

To keep track of the two colors, we use a separate array for each color, when counting the number of occurrences per disjoint set root. (Or, in other words, the size of each cluster in the matrix.)

When statistics are collected, the cluster sizes are updated to a separate histogram per color. This scheme seems to be quite performant, even though the memory consumption is significant. (A matrix of R rows and C columns requires about 25*R*C + R + 2*C (plus a constant) bytes, including the histograms.)

Here is the header file, clusters.h, that implements all of the computational logic:

#ifndef   CLUSTERS_H
#define   CLUSTERS_H
#include <stdlib.h>
#include <inttypes.h>
#include <limits.h>
#include <time.h>

/* This file is in public domain. No guarantees, no warranties.
   Written by Nominal Animal <question@nominal-animal.net>.
*/

/* For pure C89 compilers, use '-DSTATIC_INLINE=static' at compile time. */
#ifndef  STATIC_INLINE
#define  STATIC_INLINE  static inline
#endif

#define  ERR_INVALID   -1   /* Invalid function parameter */
#define  ERR_TOOLARGE  -2   /* Matrix size is too large */
#define  ERR_NOMEM     -3   /* Out of memory */

typedef  unsigned char  cluster_color;

typedef  uint32_t  cluster_label;
typedef  uint64_t  cluster_count;

#define  CLUSTER_WHITE  0
#define  CLUSTER_BLACK  1
#define  CLUSTER_NONE   UCHAR_MAX   /* Reserved */

#define  FMT_COLOR  "u"
#define  FMT_LABEL  PRIu32
#define  FMT_COUNT  PRIu64


typedef struct {
    uint64_t        state;
} prng;

typedef struct {
    /* Pseudo-random number generator used */
    prng            rng;

    /* Actual size of the matrix */
    cluster_label   rows;
    cluster_label   cols;

    /* Number of matrices the histograms have been collected from */
    cluster_count   iterations;

    /* Probability of each cell being black */
    uint64_t        p_black;

    /* Probability of diagonal connections */
    uint64_t        d_black;
    uint64_t        d_white;

    /* Cluster colormap contains (rows+2) rows and (cols+1) columns */
    cluster_color  *map;

    /* Disjoint set of (rows) rows and (cols) columns */
    cluster_label  *djs;

    /* Number of occurrences per disjoint set root */
    cluster_label  *white_roots;
    cluster_label  *black_roots;

    /* Histograms of white and black clusters */
    cluster_count  *white_histogram;
    cluster_count  *black_histogram;
} cluster;
#define  CLUSTER_INITIALIZER  { {0}, 0, 0, 0, 0.0, 0.0, 0.0, NULL, NULL, NULL, NULL, NULL, NULL }

/* Calculate uint64_t limit corresponding to probability p. */
STATIC_INLINE uint64_t  probability_limit(const double p)
{
    if (p <= 0.0)
        return UINT64_C(0);
    else
    if (p <= 0.5)
        return (uint64_t)(p * 18446744073709551615.0);
    else
    if (p >= 1.0)
        return UINT64_C(18446744073709551615);
    else
        return UINT64_C(18446744073709551615) - (uint64_t)((double)(1.0 - p) * 18446744073709551615.0);
}

/* Return true at probability corresponding to limit 'limit'.
   This implements a Xorshift64* pseudo-random number generator. */
STATIC_INLINE int  probability(prng *const rng, const uint64_t limit)
{
    uint64_t  state = rng->state;
    uint64_t  value;

    /* To correctly cover the entire range, we ensure we never generate a zero. */
    do {
        state ^= state >> 12;
        state ^= state << 25;
        state ^= state >> 27;
        value  = state * UINT64_C(2685821657736338717);
    } while (!value);

    rng->state = state;

    return (value <= limit) ? CLUSTER_BLACK : CLUSTER_WHITE;
}

/* Generate a random seed for the Xorshift64* pseudo-random number generator. */
static uint64_t  randomize(prng *const rng)
{
    unsigned int  rounds = 127;
    uint64_t      state = UINT64_C(3069887672279) * (uint64_t)time(NULL)
                        ^ UINT64_C(60498839) * (uint64_t)clock();
    if (!state)
        state = 1;

    /* Churn the state a bit. */
    while (rounds-->0) {
        state ^= state >> 12;
        state ^= state << 25;
        state ^= state >> 27;
    }

    if (rng)
        rng->state = state;

    return state;
}

/* Free all resources related to a cluster. */
STATIC_INLINE void free_cluster(cluster *c)
{
    if (c) {
        /* Free dynamically allocated pointers. Note: free(NULL) is safe. */
        free(c->map);
        free(c->djs);
        free(c->white_roots);
        free(c->black_roots);
        free(c->white_histogram);
        free(c->black_histogram);
        c->rng.state       = 0;
        c->rows            = 0;
        c->cols            = 0;
        c->iterations      = 0;
        c->p_black         = 0;
        c->d_white         = 0;
        c->d_black         = 0;
        c->map             = NULL;
        c->djs             = NULL;
        c->white_roots     = 0;
        c->black_roots     = 0;
        c->white_histogram = NULL;
        c->black_histogram = NULL;
    }
}

/* Initialize cluster structure, for a matrix of specified size. */
static int init_cluster(cluster *c, const int rows, const int cols,
                        const double p_black,
                        const double d_white, const double d_black)
{
    const cluster_label  label_cols = cols;
    const cluster_label  label_rows = rows;
    const cluster_label  color_rows = rows + 2;
    const cluster_label  color_cols = cols + 1;
    const cluster_label  color_cells = color_rows * color_cols;
    const cluster_label  label_cells = label_rows * label_cols;
    const cluster_label  labels = label_cells + 2; /* One extra! */

    if (!c)
        return ERR_INVALID;

    c->rng.state       = 0; /* Invalid seed for Xorshift64*. */
    c->rows            = 0;
    c->cols            = 0;
    c->iterations      = 0;
    c->p_black         = 0;
    c->d_white         = 0;
    c->d_black         = 0;
    c->map             = NULL;
    c->djs             = NULL;
    c->white_roots     = NULL;
    c->black_roots     = NULL;
    c->white_histogram = NULL;
    c->black_histogram = NULL;

    if (rows < 1 || cols < 1)
        return ERR_INVALID;

    if ((unsigned int)color_rows <= (unsigned int)rows ||
        (unsigned int)color_cols <= (unsigned int)cols ||
        (cluster_label)(color_cells / color_rows) != color_cols ||
        (cluster_label)(color_cells / color_cols) != color_rows ||
        (cluster_label)(label_cells / label_rows) != label_cols ||
        (cluster_label)(label_cells / label_cols) != label_rows)
        return ERR_TOOLARGE;

    c->black_histogram = calloc(labels, sizeof (cluster_count));
    c->white_histogram = calloc(labels, sizeof (cluster_count));
    c->black_roots = calloc(labels, sizeof (cluster_label));
    c->white_roots = calloc(labels, sizeof (cluster_label));
    c->djs = calloc(label_cells, sizeof (cluster_label));
    c->map = calloc(color_cells, sizeof (cluster_color));
    if (!c->map || !c->djs ||
        !c->white_roots || !c->black_roots ||
        !c->white_histogram || !c->black_histogram) {
        free(c->map);
        free(c->djs);
        free(c->white_roots);
        free(c->black_roots);
        free(c->white_histogram);
        free(c->black_histogram);
        return ERR_NOMEM;
    }

    c->rows = rows;
    c->cols = cols;

    c->p_black = probability_limit(p_black);
    c->d_white = probability_limit(d_white);
    c->d_black = probability_limit(d_black);

    /* Initialize the color map to NONE. */
    {
        cluster_color        *ptr = c->map;
        cluster_color *const  end = c->map + color_cells;
        while (ptr < end)
            *(ptr++) = CLUSTER_NONE;
    }

    /* calloc() initialized the other arrays to zeros already. */
    return 0;
}

/* Disjoint set: find root. */
STATIC_INLINE cluster_label  djs_root(const cluster_label *const  djs, cluster_label  from)
{
    while (from != djs[from])
        from = djs[from];
    return from;
}

/* Disjoint set: path compression. */
STATIC_INLINE void  djs_path(cluster_label *const  djs, cluster_label  from, const cluster_label  root)
{
    while (from != root) {
        const cluster_label  temp = djs[from];
        djs[from] = root;
        from = temp;
    }
}

/* Disjoint set: Flatten. Returns the root, and flattens the path to it. */
STATIC_INLINE cluster_label  djs_flatten(cluster_label *const  djs, cluster_label  from)
{
    const cluster_label  root = djs_root(djs, from);
    djs_path(djs, from, root);
    return root;
}

/* Disjoint set: Join two subsets. */
STATIC_INLINE void  djs_join2(cluster_label *const  djs,
                              cluster_label  from1,  cluster_label  from2)
{
    cluster_label  root, temp;

    root = djs_root(djs, from1);

    temp = djs_root(djs, from2);
    if (root > temp)
        temp = root;

    djs_path(djs, from1, root);
    djs_path(djs, from2, root);
}

/* Disjoint set: Join three subsets. */
STATIC_INLINE void  djs_join3(cluster_label *const  djs,
                              cluster_label  from1, cluster_label  from2,
                              cluster_label  from3)
{
    cluster_label  root, temp;

    root = djs_root(djs, from1);

    temp = djs_root(djs, from2);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from3);
    if (root > temp)
        root = temp;

    djs_path(djs, from1, root);
    djs_path(djs, from2, root);
    djs_path(djs, from3, root);
}

/* Disjoint set: Join four subsets. */
STATIC_INLINE void  djs_join4(cluster_label *const  djs,
                              cluster_label  from1, cluster_label  from2,
                              cluster_label  from3, cluster_label  from4)
{
    cluster_label  root, temp;

    root = djs_root(djs, from1);

    temp = djs_root(djs, from2);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from3);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from4);
    if (root > temp)
        root = temp;

    djs_path(djs, from1, root);
    djs_path(djs, from2, root);
    djs_path(djs, from3, root);
    djs_path(djs, from4, root);
}

/* Disjoint set: Join five subsets. */
STATIC_INLINE void  djs_join5(cluster_label *const  djs,
                              cluster_label  from1, cluster_label  from2,
                              cluster_label  from3, cluster_label  from4,
                              cluster_label  from5)
{
    cluster_label  root, temp;

    root = djs_root(djs, from1);

    temp = djs_root(djs, from2);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from3);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from4);
    if (root > temp)
        root = temp;

    temp = djs_root(djs, from5);
    if (root > temp)
        root = temp;

    djs_path(djs, from1, root);
    djs_path(djs, from2, root);
    djs_path(djs, from3, root);
    djs_path(djs, from4, root);
    djs_path(djs, from5, root);
}

static void iterate(cluster *const cl)
{
    prng          *const  rng = &(cl->rng);
    uint64_t       const  p_black = cl->p_black;
    uint64_t              d_color[2];

    cluster_color *const  map = cl->map + cl->cols + 2;
    cluster_label  const  map_stride = cl->cols + 1;

    cluster_label *const  djs = cl->djs;

    cluster_label        *roots[2];

    cluster_label  const  rows = cl->rows;
    cluster_label  const  cols = cl->cols;

    int                   r, c;

    d_color[CLUSTER_WHITE] = cl->d_white;
    d_color[CLUSTER_BLACK] = cl->d_black;
    roots[CLUSTER_WHITE] = cl->white_roots;
    roots[CLUSTER_BLACK] = cl->black_roots;

    for (r = 0; r < rows; r++) {
        cluster_label  const  curr_i = r * cols;
        cluster_color *const  curr_row = map + r * map_stride;
        cluster_color *const  prev_row = curr_row - map_stride;

        for (c = 0; c < cols; c++) {
            cluster_color  color = probability(rng, p_black);
            cluster_label  label = curr_i + c;
            uint64_t       diag  = d_color[color];
            unsigned int   joins = 0;

            /* Assign the label and color of the current cell, */
            djs[label] = label;
            curr_row[c] = color;

            /* Because we join left, up-left, up, and up-right, and
               all those have been assigned to already, we can do
               the necessary joins right now. */

            /* Join left? */
            joins |= (curr_row[c-1] == color) << 0;

            /* Join up? */
            joins |= (prev_row[c  ] == color) << 1;

            /* Join up left? */
            joins |= (prev_row[c-1] == color && probability(rng, diag)) << 2;

            /* Join up right? */
            joins |= (prev_row[c+1] == color && probability(rng, diag)) << 3;

            /* Do the corresponding joins. */
            switch (joins) {
            case 1: /* Left */
                djs_join2(djs, label, label - 1);
                break;
            case 2: /* Up */
                djs_join2(djs, label, label - cols);
                break;
            case 3: /* Left and up */
                djs_join3(djs, label, label - 1, label - cols);
                break;
            case 4: /* Up-left */
                djs_join2(djs, label, label - cols - 1);
                break;
            case 5: /* Left and up-left */
                djs_join3(djs, label, label - 1, label - cols - 1);
                break;
            case 6: /* Up and up-left */
                djs_join3(djs, label, label - cols, label - cols - 1);
                break;
            case 7: /* Left, up, and up-left */
                djs_join4(djs, label, label - 1, label - cols, label - cols - 1);
                break;
            case 8: /* Up-right */
                djs_join2(djs, label, label - cols + 1);
                break;
            case 9: /* Left and up-right */
                djs_join3(djs, label, label - 1, label - cols + 1);
                break;
            case 10: /* Up and up-right */
                djs_join3(djs, label, label - cols, label - cols + 1);
                break;
            case 11: /* Left, up, and up-right */
                djs_join4(djs, label, label - 1, label - cols, label - cols + 1);
                break;
            case 12: /* Up-left and up-right */
                djs_join3(djs, label, label - cols - 1, label - cols + 1);
                break;
            case 13: /* Left, up-left, and up-right */
                djs_join4(djs, label, label - 1, label - cols - 1, label - cols + 1);
                break;
            case 14: /* Up, up-left, and up-right */
                djs_join4(djs, label, label - cols, label - cols - 1, label - cols + 1);
                break;
            case 15: /* Left, up, up-left, and up-right */
                djs_join5(djs, label, label - 1, label - cols, label - cols - 1, label - cols + 1);
                break;
            }
        }
    }

    /* Count the occurrences of each disjoint-set root label. */
    if (roots[0] && roots[1]) {
        const size_t  labels = rows * cols + 2;

        /* Clear the counts. */
        memset(roots[0], 0, labels * sizeof (cluster_label));
        memset(roots[1], 0, labels * sizeof (cluster_label));

        for (r = 0; r < rows; r++) {
            const cluster_color *const  curr_row = map + r * map_stride;
            const cluster_label         curr_i   = r * cols;
            for (c = 0; c < cols; c++)
                roots[curr_row[c]][djs_flatten(djs, curr_i + c)]++;
        }
    } else {
        size_t  i = rows * cols;
        while (i-->0)
            djs_flatten(djs, i);
    }

    /* Collect the statistics. */
    if (cl->white_histogram && roots[0]) {
        const cluster_label *const  root_count = roots[0];
        cluster_count *const        histogram = cl->white_histogram;
        size_t                      i = rows * cols + 1;
        while (i-->0)
            histogram[root_count[i]]++;
    }
    if (cl->black_histogram && roots[1]) {
        const cluster_label *const  root_count = roots[1];
        cluster_count *const        histogram = cl->black_histogram;
        size_t                      i = rows * cols + 1;
        while (i-->0)
            histogram[root_count[i]]++;
    }

    /* Note: index zero and (rows*cols+1) are zero in the histogram, for ease of scanning. */
    if (cl->white_histogram || cl->black_histogram) {
        const size_t  n = rows * cols + 1;
        if (cl->white_histogram) {
            cl->white_histogram[0] = 0;
            cl->white_histogram[n] = 0;
        }
        if (cl->black_histogram) {
            cl->black_histogram[0] = 0;
            cl->black_histogram[n] = 0;
        }
    }

    /* One more iteration performed. */
    cl->iterations++;
}

#endif /* CLUSTERS_H */

Here is an example program, distribution.c, that collects cluster size statistics over a number of iterations, and outputs them in format suitable for plotting with for example Gnuplot. Run it without arguments to see usage and help.

#include <stdlib.h>
#include <inttypes.h>
#include <string.h>
#include <stdio.h>
#include "clusters.h"

/* This file is in public domain. No guarantees, no warranties.
   Written by Nominal Animal <question@nominal-animal.net>.
*/

#define  DEFAULT_ROWS     100
#define  DEFAULT_COLS     100
#define  DEFAULT_P_BLACK  0.0
#define  DEFAULT_D_WHITE  0.0
#define  DEFAULT_D_BLACK  0.0
#define  DEFAULT_ITERS    1

int usage(const char *argv0)
{
    fprintf(stderr, "\n");
    fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv0);
    fprintf(stderr, "       %s OPTIONS [ > output.txt ]\n", argv0);
    fprintf(stderr, "\n");
    fprintf(stderr, "Options:\n");
    fprintf(stderr, "       rows=SIZE   Set number of rows. Default is %d.\n", DEFAULT_ROWS);
    fprintf(stderr, "       cols=SIZE   Set number of columns. Default is %d.\n", DEFAULT_ROWS);
    fprintf(stderr, "       L=SIZE      Set rows=SIZE and cols=SIZE.\n");
    fprintf(stderr, "       black=P     Set the probability of a cell to be black. Default is %g.\n", DEFAULT_P_BLACK);
    fprintf(stderr, "                   All non-black cells are white.\n");
    fprintf(stderr, "       dwhite=P    Set the probability of white cells connecting diagonally.\n");
    fprintf(stderr, "                   Default is %g.\n", DEFAULT_D_WHITE);
    fprintf(stderr, "       dblack=P    Set the probability of black cells connecting diagonally.\n");
    fprintf(stderr, "                   Default is %g.\n", DEFAULT_D_BLACK);
    fprintf(stderr, "       N=COUNT     Number of iterations for gathering statistics. Default is %d.\n", DEFAULT_ITERS);
    fprintf(stderr, "       seed=U64    Set the Xorshift64* pseudorandom number generator seed; nonzero.\n");
    fprintf(stderr, "                   Default is to pick one randomly (based on time).\n");
    fprintf(stderr, "\n");
    fprintf(stderr, "The output consists of comment lines and data lines.\n");
    fprintf(stderr, "Comment lines begin with a #:\n");
    fprintf(stderr, "   # This is a comment line.\n");
    fprintf(stderr, "Each data line contains a cluster size, the number of white clusters of that size\n");
    fprintf(stderr, "observed during iterations, the number of black clusters of that size observed\n");
    fprintf(stderr, "during iterations, and the number of any clusters of that size observed:\n");
    fprintf(stderr, "   SIZE  WHITE_CLUSTERS  BLACK_CLUSTERS  TOTAL_CLUSTERS\n");
    fprintf(stderr, "\n");
    return EXIT_SUCCESS;
}

int main(int argc, char *argv[])
{
    int      rows = DEFAULT_ROWS;
    int      cols = DEFAULT_COLS;
    double   p_black = DEFAULT_P_BLACK;
    double   d_white = DEFAULT_D_WHITE;
    double   d_black = DEFAULT_D_BLACK;
    long     iters = DEFAULT_ITERS;
    uint64_t seed = 0;
    cluster  c = CLUSTER_INITIALIZER;

    int      arg, itemp;
    uint64_t u64temp;
    double   dtemp;
    long     ltemp;
    char     dummy;

    size_t   n;
    size_t   i;

    if (argc < 2)
        return usage(argv[0]);

    for (arg = 1; arg < argc; arg++)
        if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "/?") || !strcmp(argv[arg], "--help"))
            return usage(argv[0]);
        else
        if (sscanf(argv[arg], "L=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "l=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "size=%d %c", &itemp, &dummy) == 1) {
            rows = itemp;
            cols = itemp;
        } else
        if (sscanf(argv[arg], "seed=%" SCNu64 " %c", &u64temp, &dummy) == 1 ||
            sscanf(argv[arg], "seed=%" SCNx64 " %c", &u64temp, &dummy) == 1 ||
            sscanf(argv[arg], "s=%" SCNu64 " %c", &u64temp, &dummy) == 1 ||
            sscanf(argv[arg], "s=%" SCNx64 " %c", &u64temp, &dummy) == 1) {
            seed = u64temp;
        } else
        if (sscanf(argv[arg], "N=%ld %c", &ltemp, &dummy) == 1 ||
            sscanf(argv[arg], "n=%ld %c", &ltemp, &dummy) == 1 ||
            sscanf(argv[arg], "count=%ld %c", &ltemp, &dummy) == 1) {
            iters = ltemp;
        } else
        if (sscanf(argv[arg], "rows=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "r=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "height=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "h=%d %c", &itemp, &dummy) == 1) {
            rows = itemp;
        } else
        if (sscanf(argv[arg], "columns=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "cols=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "c=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "width=%d %c", &itemp, &dummy) == 1 ||
            sscanf(argv[arg], "w=%d %c", &itemp, &dummy) == 1) {
            cols = itemp;
        } else
        if (sscanf(argv[arg], "black=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "p0=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "b=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "P=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "p0=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "p=%lf %c", &dtemp, &dummy) == 1) {
            p_black = dtemp;
        } else
        if (sscanf(argv[arg], "white=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "p1=%lf %c", &dtemp, &dummy) == 1) {
            p_black = 1.0 - dtemp;
        } else
        if (sscanf(argv[arg], "dwhite=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "dw=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "d0=%lf %c", &dtemp, &dummy) == 1) {
            d_white = dtemp;
        } else
        if (sscanf(argv[arg], "dblack=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "db=%lf %c", &dtemp, &dummy) == 1 ||
            sscanf(argv[arg], "d1=%lf %c", &dtemp, &dummy) == 1) {
            d_black = dtemp;
        } else {
            fprintf(stderr, "%s: Unknown option.\n", argv[arg]);
            return EXIT_FAILURE;
        }

    switch (init_cluster(&c, rows, cols, p_black, d_white, d_black)) {
    case 0: break; /* OK */
    case ERR_INVALID:
        fprintf(stderr, "Invalid size.\n");
        return EXIT_FAILURE;
    case ERR_TOOLARGE:
        fprintf(stderr, "Size is too large.\n");
        return EXIT_FAILURE;
    case ERR_NOMEM:
        fprintf(stderr, "Not enough memory.\n");
        return EXIT_FAILURE;
    }

    if (!seed)
        seed = randomize(NULL);

    c.rng.state = seed;

    /* The largest possible cluster has n cells. */
    n = (size_t)rows * (size_t)cols;

    /* Print the comments describing the initial parameters. */
    printf("# seed: %" PRIu64 " (Xorshift 64*)\n", seed);
    printf("# size: %d rows, %d columns\n", rows, cols);
    printf("# P(black): %.6f (%" PRIu64 "/18446744073709551615)\n", p_black, c.p_black);
    printf("# P(black connected diagonally): %.6f (%" PRIu64 "/18446744073709551615)\n", d_black, c.d_black);
    printf("# P(white connected diagonally): %.6f (%" PRIu64 "/18446744073709551615)\n", d_white, c.d_white);
    fflush(stdout);

    while (iters-->0)
        iterate(&c);

    printf("# Iterations: %" PRIu64 "\n", c.iterations);
    printf("#\n");
    printf("# size  white_clusters(size) black_clusters(size) clusters(size)\n");

    /* Note: c._histogram[0] == c._histogram[n] == 0, for ease of scanning. */
    for (i = 1; i <= n; i++)
        if (c.white_histogram[i-1] || c.white_histogram[i] || c.white_histogram[i+1] ||
            c.black_histogram[i-1] || c.black_histogram[i] || c.black_histogram[i+1])
            printf("%lu %" FMT_COUNT " %" FMT_COUNT " %" FMT_COUNT "\n",
                   (unsigned long)i,
                   c.white_histogram[i],
                   c.black_histogram[i],
                   c.white_histogram[i]+c.black_histogram[i]);

    /* Since we are exiting anyway, this is not really necessary. */
    free_cluster(&c);

    /* All done. */
    return EXIT_SUCCESS;
}

Here is a plot generated by running distribution.c with L=100 black=0.5 dwhite=0 dblack=0.25 seed=20120622 N=100000 (cluster size distributions collected from hundred thousand 100x100 matrices, where the probability of a nonzero cell is 50%, and there is a 25% probability of nonzero cells connecting diagonally, zero cells never connecting diagonally, only horizontally and vertically): Example cluster distribution As you can see, because nonzero clusters can sometimes connect diagonally too, there are more larger nonzero clusters than zero clusters. The computation took about 37 seconds on an i5-7200U laptop. Because the Xorshift64* seed is given explicitly, this is a repeatable test.

What would be an efficient way to find the averaged size distribution without having to declare a gigantic 2D array?

2 Answers2

Linked