Detect the encoding/Charset used in a Java InputStream by reading the BOM (Byte-Order-Mark)
/**
 * Detects the encoding/Charset used in an InputStream by reading the BOM (Byte-Order-Mark).
 *
 * <p>The constructor reads up to {@code BOM_MAX_LENGTH} bytes from the wrapped stream and compares
 * them with the known BOMs. A recognised BOM is consumed; every other byte that was read is pushed
 * back, so subsequent reads from this stream start at the first byte after the BOM (or at the very
 * first byte when no BOM was found).
 *
 * @author Mattias Jiderhamn
 */
public class EncodingDetectingInputStream extends PushbackInputStream {

    /** Known Charsets and their BOMs. */
    private static final Map<Charset, int[]> BOM_CHARSET;

    /** Length of the longest known BOM. */
    private static final int BOM_MAX_LENGTH;

    static {
        // Define known BOMs
        Map<Charset, int[]> bomCharset = new HashMap<Charset, int[]>();
        bomCharset.put(Charset.forName("UTF-16BE"), new int[] {0xFE, 0xFF});
        bomCharset.put(Charset.forName("UTF-16LE"), new int[] {0xFF, 0xFE});
        bomCharset.put(Charset.forName("UTF-8"), new int[] {0xEF, 0xBB, 0xBF});
        // Additional BOMs - add as needed
        // {0x00, 0x00, 0xFE, 0xFF},
        // {0xFF, 0xFE, 0x00, 0x00},
        // {0x2B, 0x2F, 0x76, 0x38},
        // {0x2B, 0x2F, 0x76, 0x39},
        // {0x2B, 0x2F, 0x76, 0x2B},
        // {0x2B, 0x2F, 0x76, 0x2F},
        // {0xDD, 0x73, 0x66, 0x73},
        // {0x0E, 0xFE, 0xFF},
        // {0xFB, 0xEE, 0x28},
        BOM_CHARSET = Collections.unmodifiableMap(bomCharset);

        int bomMaxLength = 0;
        for (int[] bom : BOM_CHARSET.values()) {
            bomMaxLength = Math.max(bomMaxLength, bom.length);
        }
        BOM_MAX_LENGTH = bomMaxLength;
    }

    /**
     * Tests whether the bytes read so far start with the given BOM.
     *
     * @param bom       the BOM to look for
     * @param bytes     buffer holding the bytes read from the start of the stream
     * @param available number of leading entries in {@code bytes} that were actually read
     * @return {@code true} if at least {@code bom.length} bytes were read and they equal the BOM
     */
    private static boolean testForBOM(int[] bom, int[] bytes, int available) {
        if (bom.length > available) {
            return false; // The stream ended before a complete BOM could have been read
        }
        for (int index = 0; index < bom.length; index++) {
            if (bom[index] != bytes[index]) {
                return false;
            }
        }
        return true;
    }

    //////////////////////////////////////////////////////////////////////////////////////////////////////////////////

    /** The Charset detected in the InputStream of this instance; {@code null} if no known BOM was found. */
    private Charset detectedCharset;

    /**
     * Gets the detected charset.
     *
     * @return the Charset whose BOM was found at the start of the stream, or {@code null} if the
     *         stream did not start with a known BOM
     */
    public Charset getCharset() {
        return detectedCharset;
    }

    /**
     * Constructor that will automatically detect the encoding / Charset.
     *
     * @param is the stream to inspect and wrap
     * @throws IOException if reading from or pushing back to the stream fails
     */
    public EncodingDetectingInputStream(InputStream is) throws IOException {
        super(is, BOM_MAX_LENGTH);

        // Read the beginning of the stream - potentially containing a BOM.
        // Keep count of the bytes actually read: the stream may be shorter than the longest BOM,
        // and slots that were never filled must neither be matched against a BOM nor pushed back.
        final int[] buffer = new int[BOM_MAX_LENGTH];
        int bytesRead = 0;
        while (bytesRead < BOM_MAX_LENGTH) {
            final int r = read();
            if (r == -1) {
                break;
            }
            buffer[bytesRead++] = r;
        }

        // Detect if a BOM is present. The longest matching BOM wins, so the result does not depend
        // on map iteration order if a BOM that starts with a shorter one is ever registered
        // (e.g. {0xFF, 0xFE, 0x00, 0x00}, which begins with the UTF-16LE BOM).
        int bomLength = 0;
        for (Map.Entry<Charset, int[]> entry : BOM_CHARSET.entrySet()) {
            final int[] bom = entry.getValue();
            if (bom.length > bomLength && testForBOM(bom, buffer, bytesRead)) {
                this.detectedCharset = entry.getKey();
                bomLength = bom.length;
            }
        }

        // Push back, last byte first, the bytes read that were not part of the BOM
        for (int index = bytesRead - 1; index >= bomLength; index--) {
            unread(buffer[index]);
        }
    }
}
Fork
0 Feedback
You must log in before you can give any feedback
You must log in before you can post a comment


1.12k
0




Mark 'charset' tag as 'like'
Mark 'charset' tag as 'ignore'