package readImgUrl;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
 * Utilities for splitting a large listing of {@code "<id> <url>"} lines into
 * hash buckets keyed by the URL host, plus small helpers for reading, sorting
 * and writing the per-bucket text files and for sniffing a file's encoding.
 */
public class ClassifyUrl {

    /** Number of hash buckets; {@link #hash(char[])} returns a value in {@code [0, HASHLEN)}. */
    private static final int HASHLEN = 100;

    /** Directory that holds the per-bucket files ({@code <bucket>.txt}). */
    private static final String file_dir = "D:\\学习\\实验室项目\\ImageNet图片爬取\\classify_url\\";

    /** Source listing processed by {@link #main(String[])}. */
    private static final String src_file = "D:\\学习\\实验室项目\\ImageNet图片爬取\\fall11_urls.txt";

    /**
     * Entry point: runs {@link #classify_url(String)} over the configured source listing.
     *
     * @param args ignored
     * @throws Exception declared for backward compatibility; not thrown by the current body
     */
    public static void main(String[] args) throws Exception {
        classify_url(src_file);
    }

    /**
     * Sorts the lines of one bucket file by their second space-separated field
     * and writes the result next to it as {@code <filename>_.txt}.
     *
     * <p>Lines without a second field sort first (their key is the empty string).
     * If the input file cannot be read, nothing is written.
     *
     * @param filename bucket file name without directory and without the {@code .txt} suffix
     */
    public static void rank_filedata(String filename) {
        String inputPath = file_dir + filename + ".txt";
        String outputPath = file_dir + filename + "_" + ".txt";

        List<String> lines = reader_list(inputPath);
        if (lines == null) {
            // reader_list has already reported the failure; do not emit an empty output file.
            return;
        }
        System.out.println(lines.size());

        Collections.sort(lines, new Comparator<String>() {
            @Override
            public int compare(String s1, String s2) {
                return sortKey(s1).compareTo(sortKey(s2));
            }
        });
        writer_list(lines, outputPath);
    }

    /** Returns the second space-separated field of {@code line}, or {@code ""} when there is none. */
    private static String sortKey(String line) {
        String[] fields = line.split(" ");
        return fields.length > 1 ? fields[1] : "";
    }

    /**
     * Reads a text file into a list, one element per line, using the platform
     * default charset (as {@link FileReader} does).
     *
     * @param path file to read
     * @return the lines in file order, or {@code null} if the file could not be read
     */
    public static List<String> reader_list(String path) {
        List<String> lines = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
            return lines;
        } catch (IOException e) {
            System.err.println("Failed to read " + path + ": " + e);
            return null;
        }
    }

    /**
     * Writes every element of {@code list} to {@code path}, one per line,
     * terminated by {@code "\r\n"}, replacing any existing file content. The
     * platform default charset is used (as {@link FileWriter} does).
     *
     * @param list elements to write; each is converted with {@link String#valueOf(Object)}
     * @param path destination file
     */
    public static void writer_list(List<?> list, String path) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(path))) {
            for (Object element : list) {
                writer.write(String.valueOf(element));
                writer.write("\r\n");
            }
        } catch (IOException e) {
            System.err.println("Failed to write " + path + ": " + e);
        }
    }

    /**
     * Reads the listing at {@code path} line by line and computes the hash
     * bucket of each line's URL host.
     *
     * <p>After every 100th processed line, the line that follows it is printed
     * to standard output if it contains at least one character accepted by
     * {@link #isCnorEn(char)}. Lines that have no URL field or whose URL cannot
     * be parsed are reported on standard error and skipped.
     *
     * @param path listing file whose lines look like {@code "<id> <url>"}
     */
    public static void classify_url(String path) {
        String charsetName = judgeFileCode(path);
        if (charsetName == null) {
            System.err.println("Cannot determine the encoding of " + path + "; nothing was classified");
            return;
        }

        try (FileInputStream in = new FileInputStream(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, charsetName))) {
            int lineNum = 0;
            String line = reader.readLine();
            while (line != null) {
                // The bucket is computed so that malformed lines are reported, but the
                // per-bucket write stays disabled, exactly as in the previous version.
                // To enable it: if (bucket >= 0) writer(String.valueOf(bucket), line);
                bucketOf(line);

                line = reader.readLine();
                lineNum++;
                // line is null here once the end of the file is reached, so it must be
                // checked before it is sampled.
                if (line != null && lineNum % 100 == 0) {
                    for (char c : line.toCharArray()) {
                        if (isCnorEn(c)) {
                            System.out.println(line);
                            break;
                        }
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Failed to process " + path + ": " + e);
        }
    }

    /**
     * Computes the hash bucket for one listing line.
     *
     * @return the bucket in {@code [0, HASHLEN)}, or {@code -1} if the line has
     *         no second field or that field is not a parsable URL
     */
    private static int bucketOf(String line) {
        String[] fields = line.split(" ");
        if (fields.length < 2) {
            System.err.println("Skipping line without a URL field: " + line);
            return -1;
        }
        try {
            String host = new URL(fields[1]).getHost();
            return hash(host.toCharArray());
        } catch (MalformedURLException e) {
            System.err.println("Skipping line with a malformed URL: " + line);
            return -1;
        }
    }

    /**
     * Tells whether {@code c} lies in one of the two ranges this tool accepts:
     * U+0391..U+FFE5 (labelled "Chinese" by the original author) or
     * U+0000..U+00FF (labelled "English").
     *
     * @param c character to test
     * @return {@code true} if {@code c} is inside either range
     */
    static boolean isCnorEn(char c) {
        boolean inWideRange = c >= 0x0391 && c <= 0xFFE5;
        boolean inLatin1Range = c <= 0x00FF; // a char is never negative, so no lower bound is needed
        return inWideRange || inLatin1Range;
    }

    /**
     * Hashes a character sequence into a bucket number.
     *
     * @param word characters to hash
     * @return a value in {@code [0, HASHLEN)}; {@code 0} for an empty array
     */
    public static int hash(char[] word) {
        int index = 0;
        for (char c : word) {
            // "+=" is kept on purpose: each step is index * 32 + c (wrapping on int
            // overflow). Changing it would move entries between existing buckets.
            index += index * 31 + c;
        }
        // |index % HASHLEN| is always below HASHLEN, so Math.abs cannot overflow here.
        return Math.abs(index % HASHLEN);
    }

    /**
     * Appends {@code line} followed by {@code "\r\n"} to the bucket file
     * {@code <filename>.txt}, encoded as GBK. The file is created if it does
     * not exist; a {@code null} line creates the file but writes nothing.
     *
     * @param filename bucket file name without directory and without the {@code .txt} suffix
     * @param line     text to append, may be {@code null}
     */
    public static void writer(String filename, String line) {
        String path = file_dir + filename + ".txt";
        // Opening in append mode creates the file when it is missing.
        try (FileOutputStream out = new FileOutputStream(path, true);
                OutputStreamWriter writer = new OutputStreamWriter(out, "GBK")) {
            if (line != null) {
                writer.write(line + "\r\n");
            }
        } catch (IOException e) {
            System.err.println("Failed to append to " + path + ": " + e);
        }
    }

    /**
     * Guesses a file's encoding from its first three bytes.
     *
     * @param path file to inspect
     * @return {@code "UTF-8"} if the file starts with the UTF-8 byte order mark
     *         (EF BB BF), {@code "GBK"} otherwise, or {@code null} if the file
     *         could not be read
     */
    public static String judgeFileCode(String path) {
        byte[] head = new byte[3];
        int filled = 0;
        try (InputStream in = new FileInputStream(new File(path))) {
            // A single read() may return fewer bytes than requested, so loop until
            // the buffer is full or the file ends.
            while (filled < head.length) {
                int n = in.read(head, filled, head.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
        } catch (IOException e) {
            System.err.println("Failed to inspect " + path + ": " + e);
            return null;
        }

        boolean hasUtf8Bom = filled == head.length
                && head[0] == (byte) 0xEF
                && head[1] == (byte) 0xBB
                && head[2] == (byte) 0xBF;
        return hasUtf8Bom ? "UTF-8" : "GBK";
    }

    /**
     * Guesses a file's encoding from its first two bytes.
     *
     * <p>Recognised signatures: {@code EF BB} (start of the UTF-8 byte order
     * mark), {@code FF FE}, {@code FE FF}, and {@code 5C 75} (the ASCII
     * characters backslash and 'u'). Anything else, including a file shorter
     * than two bytes, yields {@code "GBK"}.
     *
     * @param fileName file to inspect
     * @return one of {@code "UTF-8"}, {@code "Unicode"}, {@code "UTF-16BE"},
     *         {@code "ANSI|ASCII"} or {@code "GBK"}
     * @throws Exception if the file cannot be opened or read
     */
    public static String codeString(String fileName) throws Exception {
        int first;
        int second;
        try (BufferedInputStream bin = new BufferedInputStream(new FileInputStream(fileName))) {
            first = bin.read();
            second = bin.read();
        }
        // read() returns -1 at end of file; folding that into the arithmetic below
        // could fake a signature (0xFF followed by EOF would look like 0xFEFF).
        if (first < 0 || second < 0) {
            return "GBK";
        }

        switch ((first << 8) + second) {
            case 0xefbb:
                return "UTF-8";
            case 0xfffe:
                return "Unicode";
            case 0xfeff:
                return "UTF-16BE";
            case 0x5c75:
                return "ANSI|ASCII";
            default:
                return "GBK";
        }
    }
}
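// 使用hash拆分文件 (article title: "Splitting a file using a hash")
// 最新推荐文章于 2023-11-28 14:29:14 发布 (page footer: "latest recommended article published 2023-11-28 14:29:14")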