when the input hex string is very big (such as 1000000 length)
Actually, 1 meg isn't all that long for today's computers.
If you need to be able to handle bigger strings (think 10s of gigabytes), or even just a LOT of 1 meg strings, you can play with the SSE functions. While it will work for more modest requirements, the added complexity may not be worth the performance gain.
I'm on Windows, so I'm building with MSVC 2019. x64, optimizations enabled, and arch:AVX2.
#define _CRT_SECURE_NO_WARNINGS
typedef unsigned char BYTE;
#include <stdio.h>
#include <memory.h>
#include <intrin.h>
#include <immintrin.h>
#include <stdint.h>
static const uint_fast8_t LOOKUP[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
void HexString2Bytes(/*IN*/ const char* hexstr, /*OUT*/ BYTE* dst)
{
for (size_t i = 0; hexstr[i] != '\0'; i += 2)
{
*dst = LOOKUP[hexstr[i]] << 4 |
LOOKUP[hexstr[i + 1]];
dst++;
}
}
void HexString2BytesSSE(const char* ptrin, char *ptrout, size_t bytes)
{
register const __m256i mmZeros = _mm256_set1_epi64x(0x3030303030303030ll);
register const __m256i mmNines = _mm256_set1_epi64x(0x0909090909090909ll);
register const __m256i mmSevens = _mm256_set1_epi64x(0x0707070707070707ll);
register const __m256i mmShuffle = _mm256_set_epi64x(-1, 0x0f0d0b0907050301, -1, 0x0f0d0b0907050301);
//============
const __m256i* in = (const __m256i*)ptrin;
__m128i* out = (__m128i *)ptrout;
size_t lines = bytes / 32;
for (size_t x = 0; x < lines; x++)
{
// Read 32 bytes
__m256i AllBytes = _mm256_load_si256(in);
// subtract '0' from every byte
AllBytes = _mm256_sub_epi8(AllBytes, mmZeros);
// Look for bytes that are 'A' or greater
const __m256i mask = _mm256_cmpgt_epi8(AllBytes, mmNines);
// Assign 7 to every byte greater than 'A'
const __m256i maskedvalues = _mm256_and_si256(mask, mmSevens);
// Subtract 7 from every byte greater than 'A'
AllBytes = _mm256_sub_epi8(AllBytes, maskedvalues);
// At this point, every byte in AllBytes represents a nibble, with
// the even bytes being the upper nibble.
// Make a copy and shift it left 4 bits to shift the nibble, plus
// 8 bits to align the nibbles.
__m256i UpperNibbles = _mm256_slli_epi64(AllBytes, 4 + 8);
// Combine the nibbles
AllBytes = _mm256_or_si256(AllBytes, UpperNibbles);
// At this point, the odd numbered bytes in AllBytes is the output we want.
// Move the bytes to be contiguous. Note that you can only move
// bytes within their 128bit lane.
const __m256i ymm1 = _mm256_shuffle_epi8(AllBytes, mmShuffle);
// Move the bytes from the upper lane down next to the lower.
const __m256i ymm2 = _mm256_permute4x64_epi64(ymm1, 8);
// Pull out the lowest 16 bytes
*out = _mm256_extracti128_si256(ymm2, 0);
in++;
out++;
}
}
int main()
{
FILE* f = fopen("test.txt", "rb");
fseek(f, 0, SEEK_END);
size_t fsize = _ftelli64(f);
rewind(f);
// HexString2Bytes requires trailing null
char* InBuff = (char* )_aligned_malloc(fsize + 1, 32);
size_t t = fread(InBuff, 1, fsize, f);
fclose(f);
InBuff[fsize] = 0;
char* OutBuff = (char*)malloc(fsize / 2);
char* OutBuff2 = nullptr;
putchar('A');
for (int x = 0; x < 16; x++)
{
HexString2BytesSSE(InBuff, OutBuff, fsize);
#if 0
if (OutBuff2 == nullptr)
{
OutBuff2 = (char*)malloc(fsize / 2);
}
HexString2Bytes(InBuff, (BYTE*)OutBuff2);
if (memcmp(OutBuff, OutBuff2, fsize / 32) != 0)
printf("oops\n");
putchar('.');
#endif
}
putchar('B');
if (OutBuff2 != nullptr)
free(OutBuff2);
free(OutBuff);
_aligned_free(InBuff);
}
A couple of things to notice:
- There is no error handling here. I don't check for out of memory, or file read errors. I don't even check for invalid characters in the input stream or lower case hex digits.
- This code assumes that the size of the string is available without having to walk the string (ftelli64 in this case). If you need to walk the string byte-by-byte to get its length (a la strlen), you've probably lost the benefit here.
- I've kept HexString2Bytes, so you can compare the outputs from my code vs yours to makes sure I'm converting correctly.
- HexString2BytesSSE assumes the number of bytes in the string is evenly divisible by 32 (a questionable assumption). However, reworking it to call HexString2Bytes for the last (at most) 31 bytes is pretty trivial, and isn't going to impact performance much.
- My test.txt is 2 gigs long, and this code runs it 16 times. That's about what I need for the differences to become readily visible.
For people who want to kibitz (because of course you do), here's the assembler output for the innermost loop along with some comments:
10F0 lea rax,[rax+10h] ; Output pointer
10F4 vmovdqu ymm0,ymmword ptr [rcx] ; Input data
10F8 lea rcx,[rcx+20h]
; Convert characters to nibbles
10FC vpsubb ymm2,ymm0,ymm4 ; Subtract 0x30 from all characters
1100 vpcmpgtb ymm1,ymm2,ymm5 ; Find all characters 'A' and greater
1104 vpand ymm0,ymm1,ymm6 ; Prepare to subtract 7 from all the 'A'
1108 vpsubb ymm2,ymm2,ymm0 ; Adjust all the 'A'
; Combine the nibbles to form bytes
110C vpsllq ymm1,ymm2,0Ch ; Shift nibble up + align nibbles
1111 vpor ymm0,ymm1,ymm2 ; Combine lower and upper nibbles
; Coalesce the odd numbered bytes
1115 vpshufb ymm2,ymm0,ymm7
; Since vpshufb can't cross lanes, use vpermq to
; put all 16 bytes together
111A vpermq ymm3,ymm2,8
1120 vmovdqu xmmword ptr [rax-10h],xmm3
1125 sub rdx,1
1129 jne main+0F0h (10F0h)
While your final code is almost certainly sufficient for your needs, I thought this might be interesting for you (or future SO users).