使用Java爬虫得到CSDN博客信息并保存(一)

最新推荐文章于 2024-04-20 22:07:49 发布

原创最新推荐文章于 2024-04-20 22:07:49 发布 · 417 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#csdn #csdn首页 #java #java爬虫 #爬虫

java爬虫专栏收录该内容

1 篇文章

订阅专栏

本文介绍如何使用Java实现爬虫，抓取CSDN博客信息，并将其存储为本地TXT文件。

需求：

上一篇文章已经可以得到博客中的信息，这篇博客将示范把信息以txt文件的形式保存到本地

代码实现：

package cn.test12.WebRobot04;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;


/**
 * Persists what was scraped from one blog into its own directory on disk.
 *
 * <p>The target location is fixed at construction time: a parent directory plus a
 * per-blogger sub-directory. Inside that sub-directory this class writes
 * {@code <bloger>.txt} (page names, titles, visit counts), {@code <bloger>.html}
 * (raw page source) and any downloaded images.
 *
 * <p>The text file is kept open for the lifetime of the object; call {@link #close()}
 * (or use try-with-resources) when done.
 */
public class SaveWeb implements AutoCloseable {
	/**
	 * Number of leading characters of an image URL that {@link #saveImg(String)} strips
	 * to obtain the local file name. Kept from the original implementation so that
	 * previously generated file names stay the same.
	 */
	private static final int IMG_NAME_OFFSET = 29;

	// Base name (without extension) of the txt and html files created for this blog.
	String webName;
	// Sub-directory that receives everything saved for this blog.
	File myFile;
	// The txt file inside myFile that the save* methods append to.
	File txtFile;
	// Buffered UTF-8 writer on txtFile, opened by the constructor and shared by the save* methods.
	BufferedWriter txtBw;

	/**
	 * Creates the blog's directory (if needed) and opens its txt file for writing.
	 * An existing txt file is truncated.
	 *
	 * @param parentFile path of the parent directory
	 * @param bloger     name of the sub-directory, also used as base name of the txt/html files
	 * @throws IllegalStateException if the directory cannot be created or the txt file cannot
	 *                               be opened; failing here avoids a later NullPointerException
	 *                               on the first write
	 */
	public SaveWeb(String parentFile, String bloger) {
		this.webName = bloger;
		this.myFile = new File(parentFile, bloger);
		// mkdirs() returns false when the directory already exists, hence the second check.
		if (!this.myFile.mkdirs() && !this.myFile.isDirectory()) {
			throw new IllegalStateException("Cannot create directory " + this.myFile);
		}
		this.txtFile = new File(this.myFile, bloger + ".txt");
		try {
			this.txtBw = new BufferedWriter(
					new OutputStreamWriter(new FileOutputStream(this.txtFile), "utf-8"));
		} catch (IOException e) {
			throw new IllegalStateException("Cannot open " + this.txtFile + " for writing", e);
		}
	}

	/**
	 * Downloads the resource at {@code imgUrl} into this blog's directory.
	 *
	 * @param imgUrl absolute URL of the image
	 * @throws Exception if the URL is malformed, the download fails or the file cannot be written
	 */
	public void saveImg(String imgUrl) throws Exception {
		URL url = new URL(imgUrl);
		URLConnection uc = url.openConnection();
		// Send a browser User-Agent with the request.
		uc.setRequestProperty("User-Agent",
				"Mozilla/5.0 (Windows NT 6.3; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0");
		File file = new File(myFile, imageFileName(imgUrl));
		byte[] buffer = new byte[1024];
		// try-with-resources closes both streams even when the copy fails midway.
		try (InputStream in = uc.getInputStream();
				FileOutputStream out = new FileOutputStream(file)) {
			int len;
			while ((len = in.read(buffer)) != -1) {
				out.write(buffer, 0, len);
			}
		}
	}

	/**
	 * Derives a local file name from an image URL.
	 *
	 * <p>URLs longer than {@link #IMG_NAME_OFFSET} keep the historical behaviour of dropping
	 * that many leading characters; shorter ones fall back to the text after the last
	 * {@code '/'}. Characters that are path separators or illegal in Windows file names are
	 * replaced by {@code '_'} so the result always names a file directly inside the blog
	 * directory.
	 */
	private static String imageFileName(String imgUrl) {
		String name;
		if (imgUrl.length() > IMG_NAME_OFFSET) {
			name = imgUrl.substring(IMG_NAME_OFFSET);
		} else {
			name = imgUrl.substring(imgUrl.lastIndexOf('/') + 1);
		}
		name = name.replaceAll("[\\\\/:*?\"<>|]", "_");
		if (name.isEmpty()) {
			throw new IllegalArgumentException("Cannot derive a file name from image URL: " + imgUrl);
		}
		return name;
	}

	/**
	 * Appends a separator line carrying the given page name to the txt file.
	 *
	 * @param webNameIntxt name to embed in the separator line
	 * @throws IOException if writing fails
	 */
	public void saveWebName(String webNameIntxt) throws IOException {
		txtBw.newLine();
		txtBw.append("--------" + webNameIntxt + "-----------");
		txtBw.newLine();
		txtBw.flush();
	}

	/**
	 * Appends the visit-count section: a header followed by one numbered line per entry.
	 *
	 * @param visits entries to write; {@code null} is treated as empty
	 * @throws Exception if writing fails
	 */
	public void saveVisit(String[] visits) throws Exception {
		writeNumberedList("---------访问量---------", visits);
	}

	/**
	 * Appends the title section: a header followed by one numbered line per entry.
	 *
	 * @param titles entries to write; {@code null} is treated as empty
	 * @throws IOException if writing fails
	 */
	public void saveTitle(String[] titles) throws IOException {
		writeNumberedList("----------标题-----------", titles);
	}

	/** Writes a blank line, the header, then {@code 1.item}, {@code 2.item}, ... and flushes. */
	private void writeNumberedList(String header, String[] items) throws IOException {
		txtBw.newLine();
		txtBw.append(header);
		txtBw.newLine();
		if (items != null) {
			int index = 1;
			for (String item : items) {
				txtBw.append(index + "." + item);
				txtBw.newLine();
				index++;
			}
		}
		// One flush per section is enough to make the data visible on disk.
		txtBw.flush();
	}

	/**
	 * Writes the given text to {@code <webName>.html} in this blog's directory,
	 * replacing any previous content.
	 *
	 * @param line the full HTML source to store
	 * @throws Exception if the file cannot be written
	 */
	public void saveAllHtml(String line) throws Exception {
		File htmlFile = new File(this.myFile, webName + ".html");
		try (BufferedWriter htmlBw = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(htmlFile), "utf-8"))) {
			htmlBw.write(line);
		}
	}

	/**
	 * Flushes and closes the txt file. The save* methods that write to it must not be
	 * called afterwards.
	 *
	 * @throws Exception if closing fails
	 */
	@Override
	public void close() throws Exception {
		this.txtBw.close();
	}
}

3.下面是个测试的小例子，会把得到的信息都保存到d:\webDown下面

package cn.test12.WebRobot04;

import org.junit.Test;

/**
 * Small driver: asks {@code GetCsdn} for the blogs listed on the CSDN blog home page and
 * runs the download routine on each of them, one after another.
 *
 * <p>A failure on one blog is reported and does not stop the remaining downloads.
 */
public class GetCsdnTest02 {
	/**
	 * Entry point.
	 *
	 * @param args unused
	 * @throws Exception if the home page itself cannot be fetched
	 */
	public static void main(String[] args) throws Exception {
		// NOTE(review): the original literal read "http://blog.youkuaiyun.com", which looks like
		// a mangled form of the CSDN blog host; restored here — confirm against the live site.
		String[] blogHomePage = new GetCsdn("http://blog.csdn.net").getBlogHomePage();
		for (String blog : blogHomePage) {
			try {
				// NOTE(review): function(...) is not defined in the visible part of this file;
				// presumably it downloads and saves one blog — verify against the full source.
				function(blog);
				System.out.println(blog + "----下载完成");
			} catch (Exception e) {
				System.out.println(blog + "----下载失败");
				// Report the cause on stderr so a failed download can be diagnosed.
				System.err.println(blog + ": " + e);
			}
		}
	}
}

测试结果：