6

I am trying to get a count of all the times a byte sequences occurs in another byte sequences. It cannot however re-use a bytes if it already counted them. For example given the string
k.k.k.k.k.k. let's assume the byte sequence was k.k it would then find only 3 occurrences rather than 5 because they would be broke down like: [k.k].[k.k].[k.k]. and not like [k.[k].[k].[k].[k].k] where they over lap and essentially just shift 2 to the right.

Ideally the idea is to get an idea how a compression dictionary or run time encoding might look. so the goal would be to get

k.k.k.k.k.k. down to just 2 parts, as (k.k.k.) is the biggest and best symbol you can have.

Here is source so far:

using System;
using System.Collections.Generic;
using System.Collections;
using System.Linq;
using System.Text;
using System.IO;


    static class Compression 
    {
        static int Main(string[] args)
        {

            List<byte> bytes = File.ReadAllBytes("ok.txt").ToList();
            List<List<int>> list = new List<List<int>>();

            // Starting Numbers of bytes - This can be changed manually.
            int StartingNumBytes = bytes.Count;
            for (int i = StartingNumBytes; i > 0; i--)
            {
                Console.WriteLine("i: " + i);

                for (int ii = 0; ii < bytes.Count - i; ii++)
                {
                    Console.WriteLine("ii: " + i);
                    // New pattern comes with refresh data.
                    List<byte> pattern = new List<byte>();

                    for (int iii = 0; iii < i; iii++)
                    {
                        pattern.Add(bytes[ii + iii]);
                    }



                    DisplayBinary(bytes, "red");
                    DisplayBinary(pattern, "green");

                    int matches = 0;
                   // foreach (var position in bytes.ToArray().Locate(pattern.ToArray()))
                    for (int position = 0; position < bytes.Count; position++) {
                        if (pattern.Count > (bytes.Count - position))
                        {
                            continue;
                        }


                        for (int iiii = 0; iiii < pattern.Count; iiii++)
                        {
                            if (bytes[position + iiii] != pattern[iiii])
                            {
                                //Have to use goto because C# doesn't support continue <level>
                                goto outer;
                            }

                        }

                        // If it made it this far, it has found a match.
                        matches++;
                        Console.WriteLine("Matches: " + matches + " Orig Count: " + bytes.Count + " POS: " + position);
                        if (matches > 1)
                        {
                            int numBytesToRemove = pattern.Count;
                            for (int ra = 0; ra < numBytesToRemove; ra++)
                            {
                                // Remove it at the position it was found at, once it
                                // deletes the first one, the list will shift left and you'll need to be here again.
                                bytes.RemoveAt(position);
                            }
                            DisplayBinary(bytes, "red");
                            Console.WriteLine(pattern.Count + " Bytes removed.");

                            // Since you deleted some bytes, set the position less because you will need to redo the pos.
                            position = position - 1;
                        }


                        outer:
                            continue;
                    }

                    List<int> sublist = new List<int>();
                    sublist.Add(matches);
                    sublist.Add(pattern.Count);
                    // Some sort of calculation to determine how good the symbol was
                    sublist.Add(bytes.Count-((matches * pattern.Count)-matches));
                    list.Add(sublist);

                }

            }



            Display(list);
            Console.Read();
            return 0;
        }


        static void DisplayBinary(List<byte> bytes, string color="white")
        {
            switch(color){
                case "green":
                    Console.ForegroundColor = ConsoleColor.Green;
                    break;
                case "red":
                    Console.ForegroundColor = ConsoleColor.Red;
                    break;
                default:
                    break;
            }


            for (int i=0; i<bytes.Count; i++)
            {
                if (i % 8 ==0)
                    Console.WriteLine();
                Console.Write(GetIntBinaryString(bytes[i]) + " ");
            }
            Console.WriteLine();
            Console.ResetColor();
        }
        static string GetIntBinaryString(int n)
        {
            char[] b = new char[8];
            int pos = 7;
            int i = 0;

            while (i < 8)
            {
                if ((n & (1 << i)) != 0)
                {
                    b[pos] = '1';
                }
                else
                {
                    b[pos] = '0';
                }
                pos--;
                i++;
            }
            //return new string(b).TrimStart('0');
            return new string(b);
        }

        static void Display(List<List<int>> list)
        {
            //
            // Display everything in the List.
            //
            Console.WriteLine("Elements:");
            foreach (var sublist in list)
            {
                foreach (var value in sublist)
                {
                    Console.Write("{0,4}", value);

                }
                Console.WriteLine();
            }

            //
            // Display total count.
            //
            int count = 0;
            foreach (var sublist in list)
            {
                count += sublist.Count;
            }
            Console.WriteLine("Count:");
            Console.WriteLine(count);
        }

        static public int SearchBytePattern(byte[] pattern, byte[] bytes)
        {
            int matches = 0;
            // precomputing this shaves some seconds from the loop execution
            int maxloop = bytes.Length - pattern.Length;
            for (int i = 0; i < maxloop; i++)
            {
                if (pattern[0] == bytes[i])
                {
                    bool ismatch = true;
                    for (int j = 1; j < pattern.Length; j++)
                    {
                        if (bytes[i + j] != pattern[j])
                        {
                            ismatch = false;
                            break;
                        }
                    }
                    if (ismatch)
                    {
                        matches++;
                        i += pattern.Length - 1;
                    }
                }
            }
            return matches;
        }
    }

Refer to the post to get the non binary of the file should be, here is the binary data: 011010110010111001101011001011100110101100101110011010110010111001101011001011100110101100101110 I am hope to have it smaller than how it started.

Joel Coehoorn
  • 399,467
  • 113
  • 570
  • 794
ParoX
  • 5,685
  • 23
  • 81
  • 152
  • Possibly calling context is relevant here. E.g. if we know that we will be looking for many patterns in a single byte array and each pattern is teh same length then it probably makes sense to precalc hash codes for each possibel sub-string in the search array - you can then compare hash codes (e.g. typically a single int32) instead of every byte in a sequence. But you do still have to test every byte if the has codes match (hash codes tell you that a sequence definitely doesn't match or else it /might/ match) – redcalx Jul 16 '11 at 12:30
  • You can have a look here: http://stackoverflow.com/questions/283456/byte-array-pattern-search This covers most of what you are looking for. – Variant Jul 10 '11 at 14:38

3 Answers3

7
private static int CountOccurences(byte[] target, byte[] pattern)
{
    var targetString = BitConverter.ToString(target);
    var patternString = BitConverter.ToString(pattern);
    return new Regex(patternString).Matches(targetString).Count;
}
Yuriy Rozhovetskiy
  • 22,270
  • 4
  • 37
  • 68
  • +1 nice and simple, tested with `byte[] target = new byte[] { 1, 2, 3, 2, 3, 1, 2, 3 }; byte[] pattern = new byte[] { 2, 3 };` – shelleybutterfly Jul 11 '11 at 22:13
  • Interesting but note that this is a highly inefficient approach - the point is relevant for processing large amounts of data and/or running on a live server environment with strict requirements on rountrip times and throughput. – redcalx Jul 16 '11 at 12:26
2

With this solution you'd have access to the individual indexes that matched (while enumerating) or you could call Count() on the result to see how many matches there were:

public static IEnumerable<int> Find<T>(T[] pattern, T[] sequence, bool overlap)
{
    int i = 0;
    while (i < sequence.Length - pattern.Length + 1)
    {
        if (pattern.SequenceEqual(sequence.Skip(i).Take(pattern.Length)))
        {
            yield return i;
            i += overlap ? 1 : pattern.Length;
        }
        else
        {
            i++;
        }
    }
}

Call it with overlap: false to solve your problem or overlap: true to see the overlapped matches (if you're interested.)

I have a couple of other methods with slightly different API (along with better performance) here, including one that work directly on streams of bytes.

Jono
  • 1,964
  • 4
  • 18
  • 35
1

quick and dirty with no regex. although i'm not sure if it answers the intent of the question, it should be relatively fast. i think i am going to run some timing tests against regex to see for sure the relative speeds:

    private int CountOccurrences(string TestString, string TestPattern)
    {
        int PatternCount = 0;
        int SearchIndex = 0;

        if (TestPattern.Length == 0)
            throw new ApplicationException("CountOccurrences: Unable to process because TestPattern has zero length.");

        if (TestString.Length == 0)
            return 0;

        do
        {
            SearchIndex = TestString.IndexOf(TestPattern, SearchIndex);

            if (SearchIndex >= 0)
            {
                ++PatternCount;
                SearchIndex += TestPattern.Length;
            }
        }
        while ((SearchIndex >= 0) && (SearchIndex < TestString.Length));

        return PatternCount;
    }

    private void btnTest_Click(object sender, EventArgs e)
    {
        string TestString1 = "k.k.k.k.k.k.k.k.k.k.k.k";
        string TestPattern1 = "k.k";

        System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1).ToString()); // outputs 6
        System.Console.WriteLine(CountOccurrences(TestString1 + ".k", TestPattern1).ToString()); // still 6
        System.Console.WriteLine(CountOccurrences(TestString1, TestPattern1 + ".").ToString()); // only 5
    }
shelleybutterfly
  • 3,216
  • 15
  • 32
  • also, your post got me thinking about huffman encoding, i always though it was fun to play around with... here's a good intro if you are interested: http://www.cs.auckland.ac.nz/~jmor159/PLDS210/huffman.html – shelleybutterfly Jul 11 '11 at 21:56
  • upon further comparison with your question's code i see that really this isn't what you need because you're doing binary... i will leave the code up just to say: i would be tempted to try doing something like just passing in byte[]s and then casting to string and searching e.g. "string TestString = System.Text.Encoding.ASCII.GetString(TestBytes);" except there are some matching rules for some cultures that would preclude this working correctly. Ahh well. :) – shelleybutterfly Jul 11 '11 at 22:07