3

I have a function that searches for a byte sequence in a large binary file and gives me its position. How can I implement a function that starts reading at that position and returns a string of a specific length, as we do with String.Substring()?

Here is the code I have so far.

/// <summary>
/// Scans every file in <c>jsscan</c> for the first occurrence of a fixed byte
/// pattern and prints the byte range where it was found.
/// </summary>
/// <remarks>
/// Matching uses Knuth-Morris-Pratt fallback, so a match that starts inside a
/// failed partial match (for example <c>40 40 00 00 00 25 32</c>) is still found.
/// </remarks>
public void example()
{
    // Hex text describing the 6 bytes (12 hex digits) to search for.
    string match = "400000002532";
    byte[] matchBytes = StringToByteArray(match);

    // An empty pattern has nothing to locate; bail out instead of indexing matchBytes[0].
    if (matchBytes.Length == 0)
    {
        return;
    }

    int[] fallback = BuildFallbackTable(matchBytes);

    foreach (var jsFile in jsscan)
    {
        // Read-only access: the scan never writes, and this also works on read-only files.
        using (var fs = new FileStream(jsFile, FileMode.Open, FileAccess.Read))
        {
            int matched = 0;
            int readByte;
            while ((readByte = fs.ReadByte()) != -1)
            {
                // On a mismatch, fall back to the longest pattern prefix that is
                // still consistent with the bytes already consumed.
                while (matched > 0 && matchBytes[matched] != readByte)
                {
                    matched = fallback[matched - 1];
                }
                if (matchBytes[matched] == readByte)
                {
                    matched++;
                }
                if (matched == matchBytes.Length)
                {
                    Console.WriteLine("It found between {0} and {1}.",
                        fs.Position - matchBytes.Length, fs.Position);
                    break;
                }
            }
        }
    }
}

/// <summary>
/// Builds the KMP fallback table: entry <c>j</c> is the length of the longest
/// proper prefix of <paramref name="pattern"/> that is also a suffix of
/// <c>pattern[0..j]</c>.
/// </summary>
private static int[] BuildFallbackTable(byte[] pattern)
{
    int[] table = new int[pattern.Length];
    int prefixLength = 0;
    for (int j = 1; j < pattern.Length; j++)
    {
        while (prefixLength > 0 && pattern[j] != pattern[prefixLength])
        {
            prefixLength = table[prefixLength - 1];
        }
        if (pattern[j] == pattern[prefixLength])
        {
            prefixLength++;
        }
        table[j] = prefixLength;
    }
    return table;
}
/// <summary>
/// Converts a string of hexadecimal digits into the bytes it describes,
/// two digits per byte (for example "4000" becomes { 0x40, 0x00 }).
/// </summary>
/// <param name="hex">Hex digits with no separators or prefix; must have even length.</param>
/// <returns>A new array of <c>hex.Length / 2</c> bytes.</returns>
/// <exception cref="ArgumentNullException"><paramref name="hex"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="hex"/> has an odd number of characters.</exception>
/// <exception cref="FormatException">A digit pair is not valid hexadecimal.</exception>
public static byte[] StringToByteArray(String hex)
{
    if (hex == null)
    {
        throw new ArgumentNullException(nameof(hex));
    }

    // Reject odd lengths up front; otherwise the final Substring call fails
    // with an unrelated out-of-range error after part of the work is done.
    if (hex.Length % 2 != 0)
    {
        throw new ArgumentException("Hex string must contain an even number of characters.", nameof(hex));
    }

    byte[] bytes = new byte[hex.Length / 2];
    for (int i = 0; i < bytes.Length; i++)
    {
        bytes[i] = Convert.ToByte(hex.Substring(i * 2, 2), 16);
    }
    return bytes;
}

An example of the data I am searching for is below:

400000002532010953667A51E44BE5B6A59417B71F4B91BBE590B6AF6E84EF570C32C56400E05123B0A44AF389331E4B91B02E8980B85157F910D7238918A73012B6243F772F7B60E5A7CF6E8CB25374B8FF96311130AABD9F71A860C904C9F6AE9706E570CC0E881E997762710EDE8818CCC551BA05579D30C0D53CEBD9BAF0C2E557D7B9D37A9C94A8A9B5FA7FCF7973B0BDA88A06DE1AE357130E4A06018ABB0A1ABD818DABEB518649CF885953EE05564FD69F0E2F860175667C5FC84F1C97727CEA1C841BFA86A26BABA942E0275FAB2A8F78132E3A05404F0DCD02FD4E7CAD08B1FFD4C2184400F22F6EBC14857BCC2E2AF858BE20CBB807C3467A91E38F31901FD452B5F87F296174631980E039CAB58D97E8F91E3255DD7DEF3177D68A4943F629A70B421B1D6E53DC0D26A1B5EF7C6912F48E0842037FA72B17C18E11B93AEE4DDA0FFE6F217BD5DEB957B1C26169029DE4396103D1F89FA0856489B1958DE5C896DB8F27A24C21AC66BF2095E383DA5EC6DA7138FE82C62FDE9BEFF0308F507736F1B35B1CA083F6C96A6860889BDCCBC989E86F4FB1C483E71557369E7308450330AEF8C9A13A115E8A97642E4A0A4098F5BC04A096A22E5F97116B59AE17BCAEFD2A8B0BCB5341EC64CA3E474900D5A8A620448A6C97827C42332C4DD326572A3C5DB4DA1362F3C0012E1AA1B70C812DCCAEF74F67E94E907518CA31945DD56A61A7
Aman Ali
  • 51
  • 8
  • So you DON'T want to know how to get the position, you just want to know how to convert a certain number of bytes following the target string into a string? If so, how do you know how many chars there should be in the following string? – Matthew Watson May 29 '18 at 09:59
  • Assuming you have utf-16 encoded bytes, know start index and byte count it's trivial to extract substring from byte array with Encoding.Unicode.GetString(bytes, start, count) – Igor Bendrup May 29 '18 at 10:00
  • @IgorBendrup It's not quite as simple as you may think, due to the fact that there might not be exactly two bytes per character (in UTF16) or one byte per character (in UTF8) because some characters may be encoded in three or more bytes. – Matthew Watson May 29 '18 at 10:08
  • @MatthewWatson I have a series of hex strings I want to search for: the first hex string has a length of 1040 chars, and the other 2 hex strings are 512 chars long. – Aman Ali May 29 '18 at 10:11
  • @MatthewWatson It doesn't make sense if OP knows exact count of bytes (not characters) which he wants to convert to string. If OP doesn't know byte count, but knows only character count, he could use Encoding.GetCharCount(bytes, start, byteCount) iteratively to get proper byte count – Igor Bendrup May 29 '18 at 10:26
  • @IgorBendrup If you know the answer please write it. – Aman Ali May 29 '18 at 11:00

2 Answers

1

If performance is not a huge concern, you could do the following, which is simpler and more readable:

// Load the file as raw bytes. The data is binary, so decoding it as text
// (e.g. via StreamReader) would alter byte values and shift offsets.
// matchBytes is the byte[] pattern to look for.
using (var fs = new FileStream(fileName, FileMode.Open, FileAccess.Read))
using (var buffer = new MemoryStream())
{
    await fs.CopyToAsync(buffer);
    byte[] content = buffer.ToArray();

    // Find the first offset at which every byte of matchBytes lines up.
    int pos = -1;
    for (int start = 0; pos == -1 && start <= content.Length - matchBytes.Length; start++)
    {
        int k = 0;
        while (k < matchBytes.Length && content[start + k] == matchBytes[k])
        {
            k++;
        }
        if (k == matchBytes.Length)
        {
            pos = start;
        }
    }

    if (pos != -1)
    {
        Console.WriteLine($"Found @ {pos}, {pos + matchBytes.Length}");
    }
}
Alen Alex
  • 897
  • 4
  • 15
  • 31
1

Assuming you know which Encoding is used to store characters in the Stream, try this function:

/// <summary>
/// Reads <paramref name="stringLength"/> characters from <paramref name="stream"/>,
/// starting at byte offset <paramref name="position"/>, decoding the bytes with
/// <paramref name="encoding"/>.
/// </summary>
/// <param name="stream">A seekable stream; it is repositioned by this call.</param>
/// <param name="position">Byte offset, from the start of the stream, of the first character.</param>
/// <param name="stringLength">
/// Number of characters to return, counted in UTF-16 code units exactly as
/// <c>String.Substring</c> counts them.
/// </param>
/// <param name="encoding">Encoding the characters are stored in.</param>
/// <returns>The decoded string of exactly <paramref name="stringLength"/> characters.</returns>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="encoding"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="stringLength"/> is negative.</exception>
/// <exception cref="EndOfStreamException">
/// The stream ends before <paramref name="stringLength"/> complete characters were decoded.
/// </exception>
static string GetString(Stream stream, long position, int stringLength, Encoding encoding)
{
    if (stream == null)
    {
        throw new ArgumentNullException(nameof(stream));
    }
    if (encoding == null)
    {
        throw new ArgumentNullException(nameof(encoding));
    }
    if (stringLength < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(stringLength), "String length must not be negative.");
    }

    stream.Seek(position, SeekOrigin.Begin);

    // A Decoder keeps partial multi-byte sequences between calls, so each byte
    // is decoded once. This avoids buffering the rest of the stream and
    // re-decoding the whole prefix after every byte.
    Decoder decoder = encoding.GetDecoder();
    var result = new StringBuilder(stringLength);
    byte[] single = new byte[1];
    // Worst-case output for one input byte, including state left over from earlier bytes.
    char[] decoded = new char[encoding.GetMaxCharCount(1)];

    int readByte;
    while (result.Length < stringLength && (readByte = stream.ReadByte()) != -1)
    {
        single[0] = (byte)readByte;
        int produced = decoder.GetChars(single, 0, 1, decoded, 0, false);
        result.Append(decoded, 0, produced);
    }

    if (result.Length < stringLength)
    {
        throw new EndOfStreamException(string.Format("Stream doesn't contains {0} characters", stringLength));
    }

    // A surrogate pair arrives as two chars at once and can overshoot by one;
    // trim so the result has exactly the requested length.
    return result.ToString(0, stringLength);
}

For example, with your code and utf-16:

// KMP fallback table for matchBytes: fallback[j] is the length of the longest
// proper prefix of the pattern that is also a suffix of matchBytes[0..j].
// It lets the scan recover from a partial match without missing an
// occurrence that starts inside it.
int[] fallback = new int[matchBytes.Length];
for (int j = 1, prefixLength = 0; j < matchBytes.Length; j++)
{
    while (prefixLength > 0 && matchBytes[j] != matchBytes[prefixLength])
    {
        prefixLength = fallback[prefixLength - 1];
    }
    if (matchBytes[j] == matchBytes[prefixLength])
    {
        prefixLength++;
    }
    fallback[j] = prefixLength;
}

// Read-only access: the scan never writes, and this also works on read-only files.
using (var fs = new FileStream(jsFile, FileMode.Open, FileAccess.Read))
{
    int matched = 0;
    int readByte;
    // An empty pattern has nothing to locate, so the scan is skipped entirely.
    while (matchBytes.Length > 0 && (readByte = fs.ReadByte()) != -1)
    {
        // On a mismatch, fall back to the longest prefix still consistent
        // with the bytes already consumed instead of restarting from zero.
        while (matched > 0 && matchBytes[matched] != readByte)
        {
            matched = fallback[matched - 1];
        }
        if (matchBytes[matched] == readByte)
        {
            matched++;
        }
        if (matched == matchBytes.Length)
        {
            Console.WriteLine("It found between {0} and {1}.",
                        fs.Position - matchBytes.Length, fs.Position);

            // Desired string length in characters
            const int DesiredStringLength = 5;
            // fs.Position is just past the match, so decoding starts right after the pattern.
            Console.WriteLine(GetString(fs, fs.Position, DesiredStringLength, Encoding.Unicode));

            break;
        }
    }
}
Igor Bendrup
  • 2,637
  • 2
  • 16
  • 15
  • This thing is becoming pretty tricky cause I dont know the encoding. – Aman Ali May 29 '18 at 15:46
  • @AmanAli Encoding detection is very different task. Check out [this](https://stackoverflow.com/questions/3825390/effective-way-to-find-any-files-encoding). But it's better to ask the author of the file about used encoding. – Igor Bendrup May 29 '18 at 17:18
  • The author is unknown sadly. – Aman Ali May 29 '18 at 18:58