Here's my optimized version in 32-bit x86 assembly. You can assemble it with NASM and link it into your program. I haven't benchmarked it, so I don't know whether it's actually fast or slow; I just had fun writing it :) (A C build-and-call sketch and a plain-C version of the same format follow the listing.)
global compress_unsigned_int

; unsigned int compress_unsigned_int(unsigned int n, unsigned char *data)
; Stores n into data using 7 data bits per byte, least significant group first;
; bit 7 of a byte is set while more bytes follow. Returns the byte count (1-5) in eax.
;
; bit fields of n:
; 31                             0
; eeeedddddddcccccccbbbbbbbaaaaaaa
compress_unsigned_int:
        mov     eax, [esp+4]            ; n
        mov     ecx, [esp+8]            ; data
        cmp     eax, 00001111111111111111111111111111b   ; 0x0FFFFFFF: fits in 28 bits?
        jbe     out4b
        ; more than 28 bits: pack ddddddd ccccccc bbbbbbb aaaaaaa into the low dword,
        ; 7 bits per byte. The shl counts are one short on purpose: the duplicated bit
        ; lands on a flag position that the OR below sets anyway.
        shld    edx, eax, 11
        shl     eax, 10
        shld    edx, eax, 8
        shl     eax, 7
        shld    edx, eax, 8
        shl     eax, 7
        shld    edx, eax, 8
        or      edx, 10000000100000001000000010000000b   ; 0x80808080: flag all four bytes
        mov     [ecx], edx
        mov     eax, [esp+4]
        shr     eax, 28                 ; eeee goes into a fifth byte, flag clear
        mov     [ecx+4], al
        mov     eax, 5                  ; wrote 5 bytes
        jmp     exit
out4b:
        cmp     eax, 00000000000111111111111111111111b   ; 0x001FFFFF: fits in 21 bits?
        jbe     out3b
        ; 22-28 bits: same packing as above, but the top byte needs no flag
        shld    edx, eax, 11
        shl     eax, 10
        shld    edx, eax, 8
        shl     eax, 7
        shld    edx, eax, 8
        shl     eax, 7
        shld    edx, eax, 8
        or      edx, 00000000100000001000000010000000b   ; 0x00808080: flag the low three bytes
        mov     [ecx], edx
        mov     eax, 4                  ; wrote 4 bytes
        jmp     exit
out3b:
        cmp     eax, 00000000000000000011111111111111b   ; 0x00003FFF: fits in 14 bits?
        jbe     out2b
        ; 15-21 bits: build aaaaaaa and bbbbbbb in the low word of edx
        shld    edx, eax, 25
        shl     eax, 24
        shld    edx, eax, 8
        mov     eax, edx                ; keep an unflagged copy for the third byte
        or      edx, 00000000000000001000000010000000b   ; 0x00008080: flag the low two bytes
        mov     [ecx], dx
        shr     eax, 15                 ; ccccccc goes into the third byte, flag clear
        mov     [ecx+2], al
        mov     eax, 3                  ; wrote 3 bytes
        jmp     exit
out2b:
        cmp     eax, 00000000000000000000000001111111b   ; 0x0000007F: fits in 7 bits?
        jbe     out1b
        ; 8-14 bits: aaaaaaa (flagged) followed by bbbbbbb
        shld    edx, eax, 25
        shl     eax, 24
        shld    edx, eax, 8
        or      edx, 00000000000000000000000010000000b   ; 0x00000080: flag the first byte only
        mov     [ecx], dx
        mov     eax, 2                  ; wrote 2 bytes
        jmp     exit
out1b:
        mov     [ecx], al               ; n fits in 7 bits: store it as-is
        mov     eax, 1                  ; wrote 1 byte
exit:
        ret
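
In case it helps, this is roughly how I build and call it from 32-bit C. The file names, the extern declaration, and the test value are just assumptions for the sake of the example, not anything the routine requires:

/* build (assuming the assembly is saved as compress.asm):
 *   nasm -f elf32 compress.asm -o compress.o
 *   gcc -m32 main.c compress.o -o demo
 */
#include <stdio.h>

/* cdecl: n and the output pointer are on the stack, the byte count comes back in eax */
unsigned int compress_unsigned_int(unsigned int n, unsigned char *data);

int main(void)
{
    unsigned char buf[5];                 /* worst case is 5 output bytes */
    unsigned int len = compress_unsigned_int(1000000u, buf);

    printf("%u bytes:", len);
    for (unsigned int i = 0; i < len; i++)
        printf(" %02x", buf[i]);
    printf("\n");
    return 0;
}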
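
And for anyone who prefers reading C, here is my own plain-C sketch of the format the bit-field comment describes (a naive reference I wrote for comparison, not the code the assembly was derived from): 7 data bits per byte, least significant group first, bit 7 set on every byte except the last. As far as I can tell it produces the same bytes as the assembly.

#include <stddef.h>

/* Reference encoder: low 7 bits first, bit 7 of a byte set while more
 * bytes follow. Returns the number of bytes written (1 to 5). */
static size_t compress_unsigned_int_ref(unsigned int n, unsigned char *data)
{
    size_t i = 0;
    while (n > 0x7F) {
        data[i++] = (unsigned char)((n & 0x7F) | 0x80);  /* 7 data bits + continuation flag */
        n >>= 7;
    }
    data[i++] = (unsigned char)n;                        /* last byte: flag clear */
    return i;
}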