16

I have an array of bytes (any length), and I want to encode this array into string using my own base encoder. In .NET is standard Base64 encoder, but what if I want to encode the array in Base62, Base53 or Base13?

Is it even possible to create such universal base encoder?

I know I could do it the simple way, that is, for each byte reserve fixed number of chars (in case of Base62, that would be 5 chars), and do direct byte->chars encoding, but I would be wasting space, as 5 Base62 chars are able to contain more than 1 byte, but less than 2 bytes.

How should I write such an encoder? Or is there already some class for this?
And please note that I need universal decoder as well, otherwise this is useless to me.

Resources

As the solution is already known (use BigInteger), I would just like to put here some resources relating the BigInteger class, as it is not available in .NET 3.5:

Big integers in C#
http://intx.codeplex.com/
https://svn.apache.org/repos/asf/incubator/heraldry/libraries/csharp/openid/trunk/Mono/Mono.Math/BigInteger.cs
http://www.codeproject.com/KB/cs/BigInteger_Library.aspx
http://www.codeproject.com/KB/cs/biginteger.aspx

Community
  • 1
  • 1
Paya
  • 5,124
  • 4
  • 45
  • 71
  • 1
    Can you explain where a `Base53` or `Base62` encoding could be of any use? – Frank Bollack Jul 16 '10 at 12:55
  • 4
    By the way `Base62` encoding is great if you want to convert byte array into string without any '/' and '+' similar symbols, just a-z, A-Z, 0-9. – Paya Jul 16 '10 at 13:28
  • 5 base 62 digits can encode a lot more than 2 bytes! – Nick Johnson Nov 23 '11 at 05:00
  • If you're still interested in this and it doesn't have to be universally mathematically portable, I'd suggest considering chunking. I've implemented the numeric div/mod work on uint64 arithmetic, converting 8 bytes at a time (produces 11 chars for base62, would need 10.75 chars, 2.3% overhead). Not as space-efficient, but almost, and way faster (have no comparison but there's no slow arbitrary-length integer involved). – ygoe Nov 09 '20 at 22:21

10 Answers10

14

A little late to the party, but...

Because your specification calls for an arbitrary number of bits, you must have an integer type that can work with an arbitrary number of bits. If you can't target .NET 4.0 you'll have to beg, borrow, or steal a BigInteger implementation somewhere (like .NET 4.0 perhaps).

public static class GenericBaseConverter
{
    public static string ConvertToString(byte[] valueAsArray, string digits, int pad)
    {
        if (digits == null)
            throw new ArgumentNullException("digits");
        if (digits.Length < 2)
            throw new ArgumentOutOfRangeException("digits", "Expected string with at least two digits");

        BigInteger value = new BigInteger(valueAsArray);
        bool isNeg = value < 0;
        value = isNeg ? -value : value;

        StringBuilder sb = new StringBuilder(pad + (isNeg ? 1 : 0));

        do
        {
            BigInteger rem;
            value = BigInteger.DivRem(value, digits.Length, out rem);
            sb.Append(digits[(int)rem]);
        } while (value > 0);

        // pad it
        if (sb.Length < pad)
            sb.Append(digits[0], pad - sb.Length);

        // if the number is negative, add the sign.
        if (isNeg)
            sb.Append('-');

        // reverse it
        for (int i = 0, j = sb.Length - 1; i < j; i++, j--)
        {
            char t = sb[i];
            sb[i] = sb[j];
            sb[j] = t;
        }

        return sb.ToString();

    }

    public static BigInteger ConvertFromString(string s, string digits)
    {
        BigInteger result;

        switch (Parse(s, digits, out result))
        {
            case ParseCode.FormatError:
                throw new FormatException("Input string was not in the correct format.");
            case ParseCode.NullString:
                throw new ArgumentNullException("s");
            case ParseCode.NullDigits:
                throw new ArgumentNullException("digits");
            case ParseCode.InsufficientDigits:
                throw new ArgumentOutOfRangeException("digits", "Expected string with at least two digits");
            case ParseCode.Overflow:
                throw new OverflowException();
        }

        return result;
    }

    public static bool TryConvertFromString(string s, string digits, out BigInteger result)
    {
        return Parse(s, digits, out result) == ParseCode.Success;
    }

    private enum ParseCode
    {
        Success,
        NullString,
        NullDigits,
        InsufficientDigits,
        Overflow,
        FormatError,
    }

    private static ParseCode Parse(string s, string digits, out BigInteger result)
    {
        result = 0;

        if (s == null)
            return ParseCode.NullString;
        if (digits == null)
            return ParseCode.NullDigits;
        if (digits.Length < 2)
            return ParseCode.InsufficientDigits;

        // skip leading white space
        int i = 0;
        while (i < s.Length && Char.IsWhiteSpace(s[i]))
            ++i;
        if (i >= s.Length)
            return ParseCode.FormatError;

        // get the sign if it's there.
        BigInteger sign = 1;
        if (s[i] == '+')
            ++i;
        else if (s[i] == '-')
        {
            ++i;
            sign = -1;
        }

        // Make sure there's at least one digit
        if (i >= s.Length)
            return ParseCode.FormatError;


        // Parse the digits.
        while (i < s.Length)
        {
            int n = digits.IndexOf(s[i]);
            if (n < 0)
                return ParseCode.FormatError;
            BigInteger oldResult = result;
            result = unchecked((result * digits.Length) + n);
            if (result < oldResult)
                return ParseCode.Overflow;

            ++i;
        }

        // skip trailing white space
        while (i < s.Length && Char.IsWhiteSpace(s[i]))
            ++i;

        // and make sure there's nothing else.
        if (i < s.Length)
            return ParseCode.FormatError;

        if (sign < 0)
            result = -result;

        return ParseCode.Success;
    }
}
Tergiver
  • 14,171
  • 3
  • 41
  • 68
  • +1 for code, but **apoorv020** first came with `BigInteger` solution. – Paya Jul 16 '10 at 16:13
  • Great code. Negative `BigInteger` case can be avoided by adding a null byte at the end to the byte array. For bases bigger than 256 it can result in a smaller string than a minus sign prefix. BigIntegers are negative if their last byte has 0x80 flag set. – tigrou Dec 14 '18 at 11:36
4

If performance is not an issue, use the BigInteger class in the background. You have a constructor for BigInteger that takes byte array, and you can then manually run loops of division and modulus to get the representation in other non-standard bases.

Also take a look at this.

apoorv020
  • 5,420
  • 11
  • 40
  • 63
  • Thanks, I totally forgot about the `BigInteger` class, that could solve the problem! Performance is not an issue as long as encoding 500 bytes of data doesn't take more than 5 seconds. – Paya Jul 16 '10 at 13:26
  • Argh, `BigInteger` is in `.NET 4.0`, but I need solution for `.NET 3.5`. :-( – Paya Jul 16 '10 at 13:34
  • What about the j# library mentioned in the 2nd link? – apoorv020 Jul 16 '10 at 13:37
  • I'm not actually into adding more dependencies into my project, so if I go with `BitInteger` solution, I will probably use some code that I can compile into my .exe, such as this [CodeProject implementation](http://www.codeproject.com/KB/cs/biginteger.aspx). Hovewer, +1, as `BigInteger` is actually able to solve this problem. And if nobody else suggests any other solution, I will stick to it and accept your answer. Thanks. – Paya Jul 16 '10 at 13:50
4

Here is a copy from my blog which I hope helps how (and why) I convert to Base62

I am currently working on my own url shortener: konv.es. In order to create the shortest possible character hash of the url, I use the GetHashCode() method of the string, then convert the resulting number to base 62 ([0-9a-zA-Z]). The most elegant solution that I have found thus far to make the convertion (which is also a handy-dandy example of a yield return) is:

public static IEnumerable<char> ToBase62(int number)
    {
        do
        {
            yield return "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"[number % 62];
            number /= 62;

        } while (number > 0);
    }

Extra credit: re-factor as an extension method

Steve Konves
  • 2,648
  • 3
  • 25
  • 44
  • Do you have a method to reverse this? – Mat Oct 15 '15 at 20:28
  • 1
    Using GetHashCode() is an unreliable method when the hashes are persisted. The hash code for a string can differ between x86 and x64 runtime and different versions of .NET. It's only guaranteed to give the same hash code at runtime. – Drakarah Sep 07 '17 at 10:30
2

BASE64 works well, because 64 is a power of 2 (2^6) so each character holds 6 bits of data, and 3 bytes (3 * 8 = 24 bits) can be encoded into 4 characters (4 * 6 = 24). The encoding & decoding can be down merely bit shifting bits.

For bases which do not align with a power of 2 (like your base 62 or Base 53), Then you must treat the message you are trying to encode as one long number and perform divison and modulo operations on it. You'd probably be better off using a Base32 encoding and squandering a bit of bandwidth.

James Curran
  • 101,701
  • 37
  • 181
  • 258
  • So there isn't any other solution than using the `BigInteger` class or something similar to that class? – Paya Jul 16 '10 at 13:58
1

You can get inspiration from C# implementation of Base32 implementation by Michael Giagnocavo.

Yakeen
  • 2,142
  • 1
  • 17
  • 21
  • I've already been through that code and there is a problem: both `Base64` and `Base32` can map directly to some number of bits, 6 in case of `Base64` and 3 in case of `Base32`, but for example `Base62` doesn't map to whole number of bits. So I have no idea how to convert that `Base32` implementation into universal base encoder. – Paya Jul 16 '10 at 13:23
  • @Paya I think you meant 5 bits for base-32 since 2⁵=32. – Kevin Li Feb 10 '16 at 03:46
0

A post on CodeReview prompted me to create a RadixEncoding class which is able to handle encoding/decoding a byte array to/from a base-N string.

The class can be found in this Q&A thread, along with documentation on (and solutions for) a few edge cases when dealing with BigInteger, endian-ness support, and the class' overall performance

Community
  • 1
  • 1
kornman00
  • 808
  • 10
  • 27
0

Another example to look at is Ascii85, used in Adobe PostScript and PDF documents. In Ascii85, 5 characters are used to encode 4 bytes. You can figure out the efficiency of this coding as (256^4)/(85^5) = 96.8%. This is the fraction of bit combinations that will actually be used.

So, for whatever new base you would want to use to encode your data, you want to look for a power that will get it just above a power of 256 if you're trying to maximize coding efficiency. This might not be easy for every base. Checking base 53 shows that the best you'll probably get is using 7 bytes to encode 5 bytes (93.6% efficiency), unless you feel like using 88 bytes to encode 63 bytes.

Justin
  • 6,611
  • 3
  • 36
  • 57
0

I've written an article which describes a solution in Python that exactly deals with your problem. I didn't use very special features of Python in order to get a solution which can easily be implemented in other languages. You might have a look and find out if it fits your needs.

Jonny Dee
  • 837
  • 4
  • 12
0

Here is the sample code snippet to convert byte array to base64. There is a very good article on this, I took reference from this.

public class Test {

    private static final char[] toBase64URL = {
            'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    public static void main(String[] args) {

        byte[] mess = "ABC123".getBytes();

        byte[] masks = { -128, 64, 32, 16, 8, 4, 2, 1 };
        StringBuilder builder = new StringBuilder();

        for(int i = 0; i < mess.length; i++) {
            for (byte m : masks) {
                if ((mess[i] & m) == m) {
                    builder.append('1');
                } else {
                    builder.append('0');
                }
            }
        }

        System.out.println(builder.toString());

        int i =0;
        StringBuilder output = new StringBuilder();
        while (i < builder.length()){
            String part = builder.substring(i, i+6);
            int index = Integer.parseInt(part, 2);
            output.append(toBase64URL[index]);
            i += 6;
        }

        System.out.println(output.toString());

    }
}
Hrishikesh Mishra
  • 3,295
  • 3
  • 27
  • 33
0

Inspired by Steve Konves's answer

using System.Numerics;

const string base62Chars = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
const string base26Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

static void Main() {
    string id = "xAQ0f58JgG";

    BigInteger i = fromBaseX(id, base62Chars);
    Console.WriteLine(i);

    string c = ToBaseX(i, base62Chars);
    Console.WriteLine(c);

    string c2 = ToBaseX(i, base26Chars);
    Console.WriteLine(c2);

    BigInteger i2 = fromBaseX(c2, base26Chars);
    Console.WriteLine(i2);
}

public static string ToBaseX(BigInteger number, string baseX)
{
    int l = baseX.Length;
    string result = "";
    while (number > 0)
    {
        BigInteger remainder = number % l;
        int index = (int)remainder;
        if (index >= l)
        {
            throw new ArgumentException($"Cannot convert {number} ToBaseX {baseX}");
        }
        result += baseX[index];
        number /= l;
    }
    return result;
}

public static BigInteger fromBaseX(string input, string baseX)
{
    int l = baseX.Length;
    BigInteger result;
    int pow = 0;
    foreach (char c in input)
    {
        int index = baseX.IndexOf(c);
        if (index < 0)
        {
            throw new ArgumentException($"Cannot convert {input} fromBaseX {baseX}");
        }
        BigInteger additions = BigInteger.Pow(l, pow) * index;
        result += additions;
        pow++;
    }
    return result;
}
BillWCJ
  • 1
  • 1