Well, I don’t know if this is the best solution, but we can test the file against various CharsetDecoders and see whether any of them reports no errors. Here is a class implementing this behaviour. (Note: the code below opens and reads the file and tests it against a decoder until EOF is reached; if an error occurs, it proceeds to the next decoder, and so on. So if you specify a large number of charsets to be tested, or test large files, it will be slow.)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
|
import
java.io.BufferedInputStream;
import
java.io.File;
import
java.io.FileInputStream;
import
java.io.FileNotFoundException;
import
java.io.IOException;
import
java.io.InputStreamReader;
import
java.nio.ByteBuffer;
import
java.nio.charset.CharacterCodingException;
import
java.nio.charset.Charset;
import
java.nio.charset.CharsetDecoder;
/**
 * Guesses the character encoding of a file by trial decoding.
 *
 * <p>Candidate charsets are tried in the order supplied by the caller. A candidate is
 * accepted when its decoder consumes the <em>entire</em> file without reporting a
 * malformed or unmappable byte sequence. The file is re-read from the start for every
 * candidate, so the cost grows with both file size and the number of candidates.
 *
 * <p>Order matters: a single-byte charset such as ISO-8859-1 maps every possible byte and
 * therefore accepts any file, so stricter encodings (for example UTF-8) should be listed
 * first.
 *
 * @author Georgios Migdos
 */
public class CharsetDetector {

    /** Size, in chars, of the scratch buffer used while draining a decoding reader. */
    private static final int BUFFER_SIZE = 512;

    /**
     * Returns the first charset in {@code charsets} that decodes the whole file cleanly.
     *
     * @param f        the file to inspect
     * @param charsets candidate charset names, tried in array order
     * @return the first matching charset, or {@code null} if no candidate decodes the file
     *         or the file cannot be read
     * @throws NullPointerException if {@code f} or {@code charsets} is {@code null}
     * @throws IllegalArgumentException if a candidate name is illegal or not supported by
     *         this JVM (thrown by {@link Charset#forName(String)})
     */
    public Charset detectCharset(File f, String[] charsets) {
        for (String charsetName : charsets) {
            Charset charset = detectCharset(f, Charset.forName(charsetName));
            if (charset != null) {
                return charset;
            }
        }
        return null;
    }

    /**
     * Checks whether the complete content of {@code f} is valid in {@code charset}.
     *
     * <p>The file is decoded as one continuous stream, so a multi-byte character that
     * happens to straddle an internal buffer boundary is handled correctly, and an invalid
     * sequence anywhere in the file (not just in the first block) rejects the charset.
     * An empty file is valid in every charset.
     *
     * @param f       the file to decode
     * @param charset the charset to test
     * @return {@code charset} if the whole file decodes without error, otherwise
     *         {@code null} (also when the file is missing or unreadable)
     */
    private Charset detectCharset(File f, Charset charset) {
        // A freshly created decoder reports malformed/unmappable input instead of
        // replacing it, which makes the reader below throw on the first bad sequence.
        CharsetDecoder decoder = charset.newDecoder();
        try (InputStreamReader reader = new InputStreamReader(
                new BufferedInputStream(new FileInputStream(f)), decoder)) {
            char[] sink = new char[BUFFER_SIZE];
            while (reader.read(sink) != -1) {
                // The decoded text is irrelevant; only the absence of errors matters.
            }
            return charset;
        } catch (CharacterCodingException notThisCharset) {
            // The bytes are not valid in this charset.
            return null;
        } catch (IOException unreadable) {
            // Missing or unreadable file: reported as "no match", as callers expect.
            return null;
        }
    }

    /**
     * Demo entry point: detects the charset of a file and prints its decoded content.
     *
     * @param args optional; {@code args[0]} is the path of the file to inspect. When no
     *             argument is given, {@code example.txt} in the working directory is used.
     */
    public static void main(String[] args) {
        File f = new File(args.length > 0 ? args[0] : "example.txt");
        String[] charsetsToBeTested = {"UTF-8", "windows-1253", "ISO-8859-7", "GBK"};

        CharsetDetector cd = new CharsetDetector();
        Charset charset = cd.detectCharset(f, charsetsToBeTested);

        if (charset == null) {
            System.out.println("Unrecognized charset.");
            return;
        }

        try (InputStreamReader reader = new InputStreamReader(
                new BufferedInputStream(new FileInputStream(f)), charset)) {
            char[] chunk = new char[BUFFER_SIZE];
            int count;
            while ((count = reader.read(chunk)) != -1) {
                System.out.print(new String(chunk, 0, count));
            }
        } catch (IOException ioe) {
            // Covers FileNotFoundException as well (it is an IOException).
            ioe.printStackTrace();
        }
    }
}
|