I am trying to read a file that is encoded in Shift_JIS and convert it to UTF-8. When I call java.nio's CharsetDecoder.decode, it throws the following error, and I am not able to pinpoint the actual cause of the issue.
java.nio.charset.UnmappableCharacterException: Input length = 2
java.nio.charset.UnmappableCharacterException: Input length = 2
at java.nio.charset.CoderResult.throwException(CoderResult.java:278)
at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:798)
at CharacterSetConversionUtility.getString(CharacterSetConversionUtility.java:23)
at CharacterSetConversionUtility.convertBetweenEncodings(CharacterSetConversionUtility.java:39)
at CharacterSetConversionUtility.main(CharacterSetConversionUtility.java:94)
Below is the code snippet
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.charset.CharsetDecoder;
import org.mozilla.universalchardet.UniversalDetector;
public class CharacterSetConversionUtility
{
/**
 * Decodes a byte array into a String using the named character set.
 *
 * @param charSet name of the charset the bytes are encoded in
 * @param bytes   the encoded bytes
 * @return the decoded text
 * @throws CharacterCodingException if the bytes contain a sequence that is malformed
 *         or unmappable in the given charset
 */
public static String getString(String charSet, byte[] bytes) throws CharacterCodingException
{
    final ByteBuffer encoded = ByteBuffer.wrap(bytes);
    // A freshly created decoder reports bad input by throwing rather than substituting.
    return Charset.forName(charSet).newDecoder().decode(encoded).toString();
}
/**
 * Encodes a String into bytes using the named character set.
 *
 * @param charSet name of the target charset
 * @param input   the text to encode
 * @return exactly the encoded bytes, with no trailing padding
 * @throws CharacterCodingException if the input contains a character that cannot be
 *         encoded in the given charset
 */
public static byte[] convertToEncoding(String charSet, String input) throws CharacterCodingException
{
    CharBuffer buffer = CharBuffer.wrap(input);
    Charset charset = Charset.forName(charSet);
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer output = encoder.encode(buffer);
    // The buffer returned by encode() can be backed by an array larger than the encoded
    // content, so copy only the bytes between position and limit instead of using array().
    byte[] encoded = new byte[output.remaining()];
    output.get(encoded);
    return encoded;
}
/**
 * Re-encodes bytes from one character set into another by decoding them to a String
 * and encoding that String again.
 *
 * @param originalBytes bytes encoded in {@code sourceCharSet}
 * @param sourceCharSet name of the charset the input bytes are encoded in
 * @param destCharSet   name of the charset to produce
 * @return the text re-encoded in {@code destCharSet}
 * @throws CharacterCodingException if decoding or encoding fails
 */
public static byte[] convertBetweenEncodings(byte[] originalBytes, String sourceCharSet, String destCharSet)
        throws CharacterCodingException
{
    return convertToEncoding(destCharSet, getString(sourceCharSet, originalBytes));
}
/**
 * Detects the character encoding of a file by feeding its bytes to a
 * {@link UniversalDetector}.
 *
 * @param fileName path of the file to inspect
 * @return the charset name reported by the detector, or {@code null} if no encoding
 *         was detected or the file could not be read
 */
public static String getCharacterEncoding(String fileName){
    byte[] buf = new byte[4096];
    String encoding = null;
    // try-with-resources closes the stream even if reading or detection fails.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {
        UniversalDetector detector = new UniversalDetector(null);
        int nread;
        // Feed chunks until end of file or until the detector reports it is done.
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        // Signal end of input before asking for the result.
        detector.dataEnd();
        encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        detector.reset();
    } catch (IOException e) {
        // Also covers FileNotFoundException; the caller sees a null result.
        e.printStackTrace();
    }
    return encoding;
}
/**
 * Reads the sample file, detects its encoding, converts its content to UTF-8 and
 * writes the converted bytes to the output file.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String inputFile = "E:/Encoding Issue/SJISFile";
    Path path = Paths.get(inputFile);
    try {
        byte[] inputdata = Files.readAllBytes(path);
        //Detect the character encoding of the input data
        String inputCharEncoding = getCharacterEncoding(inputFile);
        if (inputCharEncoding == null) {
            // Charset.forName(null) would throw IllegalArgumentException; stop with a clear message instead.
            System.err.println("Could not detect the encoding of " + inputFile + "; nothing converted.");
            return;
        }
        // NOTE(review): if the detector reports Shift_JIS but decoding fails with
        // UnmappableCharacterException, the file may really be windows-31j (MS932),
        // which adds vendor-specific characters - confirm against the actual file.
        //Perform a character set conversion
        byte[] outputdata = convertBetweenEncodings(inputdata, inputCharEncoding, "UTF-8");
        // Files.write opens, writes and closes the file, so nothing leaks if the write fails.
        Files.write(Paths.get("E:/Encoding Issue/convertedutf8.txt"), outputdata);
    } catch (IOException e) {
        // CharacterCodingException is an IOException, so conversion failures land here too.
        e.printStackTrace();
    }
}