package tools;
import java.io.*;
import java.io.BufferedInputStream;
import java.nio.charset.Charset;
import java.util.Map;
/**
 * Reads HTML pages (originally the split Tianwang corpus) whose encoding is not
 * known in advance.
 *
 * <p>The first lines of a file are searched for a {@code charset=...}
 * declaration. If one names an encoding this JVM supports, the file is decoded
 * with it; otherwise GBK is assumed, so pages that declare no charset and are
 * not GBK may still be decoded incorrectly.</p>
 *
 * <p>Copyright: Copyright (c) 2006</p>
 *
 * @author yezh
 * @version 1.0
 */
public class ReadHtml {
    /** Number of leading lines searched for a charset declaration. */
    private static final int HEADER_LINES = 40;

    /** Encoding used when no supported charset declaration is found. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Keyword that introduces the encoding name in a declaration. */
    private static final String KEYWORD = "charset";

    /**
     * Reads the file at the given path; see {@link #ReadAllKindOfCharset(File)}.
     *
     * @param sfile path of the file to read
     * @return the decoded text, every line terminated by {@code '\n'}
     */
    public static String ReadAllKindOfCharset(String sfile) {
        return ReadAllKindOfCharset(new File(sfile));
    }

    /**
     * Reads a file, decoding it with the charset it declares in its first
     * {@value #HEADER_LINES} lines, or with GBK when it declares none that is
     * supported.
     *
     * <p>Failures are reported on standard output and are not propagated; the
     * text decoded so far (possibly empty) is returned in that case.</p>
     *
     * @param file the file to read
     * @return the decoded text, every line terminated by {@code '\n'}
     */
    public static String ReadAllKindOfCharset(File file) {
        String charset = DEFAULT_CHARSET;
        StringBuffer buffer = new StringBuffer(4096);
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            // The bytes are held in memory so the header can be inspected and
            // the text decoded from the start without depending on mark/reset.
            byte[] raw = readAllBytes(in);
            charset = detectCharset(raw);
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new ByteArrayInputStream(raw), charset));
            String line;
            while ((line = br.readLine()) != null) {
                buffer.append(line).append('\n');
            }
        } catch (Exception ex) {
            // Deliberately broad: this method has never thrown, so callers
            // rely on getting a (possibly empty) string back.
            System.out.println("读取文件出错");
            System.out.println(charset);
        } finally {
            closeQuietly(in);
        }
        return buffer.toString();
    }

    /** Drains the stream into a byte array. */
    private static byte[] readAllBytes(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream(4096);
        byte[] chunk = new byte[4096];
        int count;
        while ((count = in.read(chunk)) != -1) {
            out.write(chunk, 0, count);
        }
        return out.toByteArray();
    }

    /** Closes the stream if it was opened, ignoring a failure to close. */
    private static void closeQuietly(InputStream in) {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done; the content was already read.
            }
        }
    }

    /**
     * Searches the leading lines of the raw bytes for a supported charset
     * declaration and returns its name, or the default when there is none.
     */
    private static String detectCharset(byte[] raw) {
        int pos = 0;
        for (int lineNo = 0; lineNo < HEADER_LINES && pos < raw.length; lineNo++) {
            StringBuffer text = new StringBuffer();
            while (pos < raw.length && raw[pos] != '\n' && raw[pos] != '\r') {
                // One byte becomes one char: the declaration itself is ASCII.
                text.append((char) (raw[pos] & 0xFF));
                pos++;
            }
            if (pos < raw.length) {
                boolean carriageReturn = raw[pos] == '\r';
                pos++;
                // Treat CR LF as one terminator, but never swallow the byte
                // after a lone CR.
                if (carriageReturn && pos < raw.length && raw[pos] == '\n') {
                    pos++;
                }
            }
            String found = findCharsetInLine(
                    text.toString().toLowerCase(java.util.Locale.ROOT));
            if (found != null) {
                return found;
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Returns the first supported charset declared in the (lower-cased) line,
     * or {@code null} when the line declares none.
     */
    private static String findCharsetInLine(String line) {
        int hit = line.indexOf(KEYWORD);
        while (hit != -1) {
            String name = readCharsetValue(line, hit + KEYWORD.length());
            if (isSupported(name)) {
                return name;
            }
            hit = line.indexOf(KEYWORD, hit + KEYWORD.length());
        }
        return null;
    }

    /**
     * Parses the value of a {@code charset = "name"} declaration whose keyword
     * ends at {@code from}; the quotes are optional. Returns an empty string
     * when the keyword is not followed by {@code '='}.
     */
    private static String readCharsetValue(String line, int from) {
        int length = line.length();
        int i = skipSpaces(line, from);
        if (i >= length || line.charAt(i) != '=') {
            return "";
        }
        i = skipSpaces(line, i + 1);
        if (i < length && (line.charAt(i) == '"' || line.charAt(i) == '\'')) {
            i = skipSpaces(line, i + 1);
        }
        int start = i;
        while (i < length && isCharsetNameChar(line.charAt(i))) {
            i++;
        }
        return line.substring(start, i);
    }

    /** Returns the index of the first non-whitespace char at or after {@code from}. */
    private static int skipSpaces(String line, int from) {
        int i = from;
        while (i < line.length() && Character.isWhitespace(line.charAt(i))) {
            i++;
        }
        return i;
    }

    /** Tells whether the (lower-cased) char may appear in a charset name. */
    private static boolean isCharsetNameChar(char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
                || ch == '-' || ch == '_' || ch == '.' || ch == ':' || ch == '+';
    }

    /** Tells whether the name is a legal charset name (or alias) this JVM supports. */
    private static boolean isSupported(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException ex) {
            // Thrown for empty or syntactically illegal names.
            return false;
        }
    }

    /**
     * Reads every regular file below the given directory, descending into
     * sub-directories. The decoded text is discarded. Does nothing when the
     * path is not a directory or cannot be listed.
     *
     * @param Directory path of the directory to walk
     */
    public static void processDirectory(String Directory) {
        File dir = new File(Directory);
        if (!dir.isDirectory()) {
            return;
        }
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].isFile()) {
                ReadAllKindOfCharset(files[i]);
            } else if (files[i].isDirectory()) {
                // Recurse with the full path; the bare name would be resolved
                // against the working directory instead of the parent.
                processDirectory(files[i].getPath());
            }
        }
    }

    /** Manual test: prints one sample page and the time taken to read it. */
    public static void main(String args[]) {
        long start = System.currentTimeMillis();
        System.out.println(ReadHtml.ReadAllKindOfCharset(("luan/arrow.com.cnpdadianyingdefault.asp.htm")));
        System.out.println("time = " + (System.currentTimeMillis() - start));
        if (Charset.isSupported("gb_2312-80")) {
            System.out.println("true");
            System.out.println("·");
        }
    }
}
import java.io.*;
import java.io.BufferedInputStream;
import java.nio.charset.Charset;
import java.util.Map;
/**
 * Reads HTML pages (originally the split Tianwang corpus) whose encoding is not
 * known in advance.
 *
 * <p>The first lines of a file are searched for a {@code charset=...}
 * declaration. If one names an encoding this JVM supports, the file is decoded
 * with it; otherwise GBK is assumed, so pages that declare no charset and are
 * not GBK may still be decoded incorrectly.</p>
 *
 * <p>Copyright: Copyright (c) 2006</p>
 *
 * @author yezh
 * @version 1.0
 */
public class ReadHtml {
    /** Number of leading lines searched for a charset declaration. */
    private static final int HEADER_LINES = 40;

    /** Encoding used when no supported charset declaration is found. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Keyword that introduces the encoding name in a declaration. */
    private static final String KEYWORD = "charset";

    /**
     * Reads the file at the given path; see {@link #ReadAllKindOfCharset(File)}.
     *
     * @param sfile path of the file to read
     * @return the decoded text, every line terminated by {@code '\n'}
     */
    public static String ReadAllKindOfCharset(String sfile) {
        return ReadAllKindOfCharset(new File(sfile));
    }

    /**
     * Reads a file, decoding it with the charset it declares in its first
     * {@value #HEADER_LINES} lines, or with GBK when it declares none that is
     * supported.
     *
     * <p>Failures are reported on standard output and are not propagated; the
     * text decoded so far (possibly empty) is returned in that case.</p>
     *
     * @param file the file to read
     * @return the decoded text, every line terminated by {@code '\n'}
     */
    public static String ReadAllKindOfCharset(File file) {
        String charset = DEFAULT_CHARSET;
        StringBuffer buffer = new StringBuffer(4096);
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            // The bytes are held in memory so the header can be inspected and
            // the text decoded from the start without depending on mark/reset.
            byte[] raw = readAllBytes(in);
            charset = detectCharset(raw);
            BufferedReader br = new BufferedReader(
                    new InputStreamReader(new ByteArrayInputStream(raw), charset));
            String line;
            while ((line = br.readLine()) != null) {
                buffer.append(line).append('\n');
            }
        } catch (Exception ex) {
            // Deliberately broad: this method has never thrown, so callers
            // rely on getting a (possibly empty) string back.
            System.out.println("读取文件出错");
            System.out.println(charset);
        } finally {
            closeQuietly(in);
        }
        return buffer.toString();
    }

    /** Drains the stream into a byte array. */
    private static byte[] readAllBytes(InputStream in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream(4096);
        byte[] chunk = new byte[4096];
        int count;
        while ((count = in.read(chunk)) != -1) {
            out.write(chunk, 0, count);
        }
        return out.toByteArray();
    }

    /** Closes the stream if it was opened, ignoring a failure to close. */
    private static void closeQuietly(InputStream in) {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done; the content was already read.
            }
        }
    }

    /**
     * Searches the leading lines of the raw bytes for a supported charset
     * declaration and returns its name, or the default when there is none.
     */
    private static String detectCharset(byte[] raw) {
        int pos = 0;
        for (int lineNo = 0; lineNo < HEADER_LINES && pos < raw.length; lineNo++) {
            StringBuffer text = new StringBuffer();
            while (pos < raw.length && raw[pos] != '\n' && raw[pos] != '\r') {
                // One byte becomes one char: the declaration itself is ASCII.
                text.append((char) (raw[pos] & 0xFF));
                pos++;
            }
            if (pos < raw.length) {
                boolean carriageReturn = raw[pos] == '\r';
                pos++;
                // Treat CR LF as one terminator, but never swallow the byte
                // after a lone CR.
                if (carriageReturn && pos < raw.length && raw[pos] == '\n') {
                    pos++;
                }
            }
            String found = findCharsetInLine(
                    text.toString().toLowerCase(java.util.Locale.ROOT));
            if (found != null) {
                return found;
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Returns the first supported charset declared in the (lower-cased) line,
     * or {@code null} when the line declares none.
     */
    private static String findCharsetInLine(String line) {
        int hit = line.indexOf(KEYWORD);
        while (hit != -1) {
            String name = readCharsetValue(line, hit + KEYWORD.length());
            if (isSupported(name)) {
                return name;
            }
            hit = line.indexOf(KEYWORD, hit + KEYWORD.length());
        }
        return null;
    }

    /**
     * Parses the value of a {@code charset = "name"} declaration whose keyword
     * ends at {@code from}; the quotes are optional. Returns an empty string
     * when the keyword is not followed by {@code '='}.
     */
    private static String readCharsetValue(String line, int from) {
        int length = line.length();
        int i = skipSpaces(line, from);
        if (i >= length || line.charAt(i) != '=') {
            return "";
        }
        i = skipSpaces(line, i + 1);
        if (i < length && (line.charAt(i) == '"' || line.charAt(i) == '\'')) {
            i = skipSpaces(line, i + 1);
        }
        int start = i;
        while (i < length && isCharsetNameChar(line.charAt(i))) {
            i++;
        }
        return line.substring(start, i);
    }

    /** Returns the index of the first non-whitespace char at or after {@code from}. */
    private static int skipSpaces(String line, int from) {
        int i = from;
        while (i < line.length() && Character.isWhitespace(line.charAt(i))) {
            i++;
        }
        return i;
    }

    /** Tells whether the (lower-cased) char may appear in a charset name. */
    private static boolean isCharsetNameChar(char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')
                || ch == '-' || ch == '_' || ch == '.' || ch == ':' || ch == '+';
    }

    /** Tells whether the name is a legal charset name (or alias) this JVM supports. */
    private static boolean isSupported(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException ex) {
            // Thrown for empty or syntactically illegal names.
            return false;
        }
    }

    /**
     * Reads every regular file below the given directory, descending into
     * sub-directories. The decoded text is discarded. Does nothing when the
     * path is not a directory or cannot be listed.
     *
     * @param Directory path of the directory to walk
     */
    public static void processDirectory(String Directory) {
        File dir = new File(Directory);
        if (!dir.isDirectory()) {
            return;
        }
        File[] files = dir.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            return;
        }
        for (int i = 0; i < files.length; i++) {
            if (files[i].isFile()) {
                ReadAllKindOfCharset(files[i]);
            } else if (files[i].isDirectory()) {
                // Recurse with the full path; the bare name would be resolved
                // against the working directory instead of the parent.
                processDirectory(files[i].getPath());
            }
        }
    }

    /** Manual test: prints one sample page and the time taken to read it. */
    public static void main(String args[]) {
        long start = System.currentTimeMillis();
        System.out.println(ReadHtml.ReadAllKindOfCharset(("luan/arrow.com.cnpdadianyingdefault.asp.htm")));
        System.out.println("time = " + (System.currentTimeMillis() - start));
        if (Charset.isSupported("gb_2312-80")) {
            System.out.println("true");
            System.out.println("·");
        }
    }
}
本文介绍了一个Java类,用于解析HTML文件并自动检测其字符集编码。该类通过搜索文件头部来确定正确的编码方式,以便正确读取和处理不同编码的网页内容。
803

被折叠的 条评论
为什么被折叠?



