<p> I built a stack to hold page URLs, then pop and crawl each page on the stack and save it to a target directory. Here is the main class: </p><div>
</div><pre name="code" class="java">package friday;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
public class MainPanel {
static FileOutputStream out = null;
static PrintStream p = null;
static int htmlname=0;
public static void main(String[] args) {
    MyStack stack = new MyStack(10);
    stack.push("http://www.baidu.com");
    stack.push("http://www.sina.com");
    stack.push("http://baike.baidu.com/link?url=vSzZ7v6e2v9s6ZoFBXpDjc6VVJr7Hnb93KC-oph-X2GhVf8C29yvSpOCHpUhwwxJK1_ndDr0a2tIowUT1AGbu_");
    stack.push("http://www.mafengwo.cn/travel-scenic-spot/mafengwo/11058.html");
    long startTime = System.currentTimeMillis();
    while (!stack.isEmpty()) {
        htmlname++;
        String readyString = stack.pop();
        String pagetxt = getPageText(readyString);
        createFile(pagetxt, htmlname + ".html");
    }
    long endTime = System.currentTimeMillis();
    long useTime = endTime - startTime;
    System.out.println("Crawling these pages took " + useTime + " ms, i.e. " + useTime / 1000 + " s");
}
public static void createFile(String mesHtml, String pathname) {
    final String path = "D:/网页爬虫-爬爬爬1.4";
    File dir = new File(path);
    if (!dir.exists()) {
        dir.mkdirs();
        System.out.println("Directory did not exist; created it");
    }
    File textFile = new File(dir, pathname); // create the page file in that directory
    // FileOutputStream creates the file if missing and truncates it if present,
    // so a single branch handles both "new file" and "overwrite" cases
    try {
        out = new FileOutputStream(textFile);
        p = new PrintStream(out);
        p.println(mesHtml); // write the page source
        p.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
public static String getPageText(String website) {
    StringBuilder result = new StringBuilder();
    try {
        URL url = new URL(website);
        URLConnection con = url.openConnection();
        con.connect();
        InputStream in = con.getInputStream();
        // decode the byte stream as UTF-8 up front instead of re-encoding each line afterwards
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
        String line;
        while ((line = reader.readLine()) != null) {
            result.append(line).append("\n");
        }
        reader.close();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    System.out.println("----------------- " + website + "\nhas " + result.length() + " characters -------------------------"); // log the page size
    return result.toString();
}
}
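One caveat with the reader above: it assumes every page is UTF-8, but Chinese portals often serve GBK/GB2312. A minimal sketch of picking the charset from the HTTP Content-Type header instead (the class and method names here are my own, not from the original code):

```java
public class CharsetPick {
    // Parse a charset out of a Content-Type header value,
    // e.g. "text/html; charset=gb2312" -> "gb2312".
    static String detectCharset(String contentType) {
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String t = part.trim().toLowerCase();
                if (t.startsWith("charset=")) {
                    return t.substring("charset=".length());
                }
            }
        }
        return "utf-8"; // fallback when the server names no charset
    }

    public static void main(String[] args) {
        System.out.println(detectCharset("text/html; charset=gb2312")); // gb2312
        System.out.println(detectCharset("text/html"));                 // utf-8
    }
}
```

In `getPageText` this would be wired in as `new InputStreamReader(in, detectCharset(con.getContentType()))`, so each page is decoded with the charset its server declares.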
Next, the stack class:
<pre name="code" class="java">package friday;
public class MyStack {
private String[] stackArray;
private int top;
private int maxSize;
public MyStack(int num) {
maxSize = num;
top = -1;
stackArray = new String[maxSize];
}
public void push(String spe) { // push; note there is no overflow check, so callers should test isFull() first
stackArray[++top]= spe;
}
public String pop() { // pop
return stackArray[top--];
}
public String peek() {
return stackArray[top]; // look at the top element without removing it
}
public boolean isEmpty() {
return (top == -1); // is the stack empty?
}
public boolean isFull() {
return (top == maxSize - 1); // is the stack full?
}
public void traverse() { // walk through every element
for (int i = 0; i <= top; i++) {
System.out.println("the result of " + i + " is: " + stackArray[i]);
}
}
public static void main(String[] args) {
MyStack stack = new MyStack(10);
stack.push("41");
stack.push("ew");
stack.push("www.ace.com");
stack.push("1w");
stack.push("2w");
while (!stack.isEmpty()) {
String value = stack.pop();
System.out.println(value);
}
System.out.println("OK");
}
}
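For comparison, the JDK's `java.util.ArrayDeque` provides the same LIFO behavior without a fixed capacity, so the overflow concern in `push` goes away. A quick sketch mirroring the test `main` above:

```java
import java.util.ArrayDeque;
import java.util.Deque;

public class DequeDemo {
    public static void main(String[] args) {
        Deque<String> stack = new ArrayDeque<>();
        stack.push("41");
        stack.push("ew");
        stack.push("www.ace.com");
        // pop order is last-in, first-out, exactly like MyStack
        while (!stack.isEmpty()) {
            System.out.println(stack.pop());
        }
        System.out.println("OK");
    }
}
```

The deque grows as needed, so the crawler's URL list is no longer capped at the `maxSize` passed to the constructor.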
This post described a way to use a stack to store and manage the URLs waiting to be crawled, showed with concrete code how to fetch those pages and save their contents locally, and included a simple stack implementation.
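The code above stops at saving pages that were pushed by hand; the natural next step for a crawler is to pull `href` links out of each fetched page and push them back onto the stack. A rough regex-based sketch of that step (the class and method names are mine, and a real crawler would use a proper HTML parser rather than a regex):

```java
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtract {
    // Collect absolute http(s) URLs from href="..." attributes in a page.
    static List<String> extractLinks(String html) {
        List<String> links = new ArrayList<>();
        Matcher m = Pattern.compile("href=\"(https?://[^\"]+)\"").matcher(html);
        while (m.find()) {
            links.add(m.group(1)); // group 1 is the URL inside the quotes
        }
        return links;
    }

    public static void main(String[] args) {
        String html = "<a href=\"http://www.baidu.com\">baidu</a> <a href=\"/local\">skipped</a>";
        System.out.println(extractLinks(html)); // relative links are ignored here
    }
}
```

In the crawl loop one would call `extractLinks(pagetxt)` and push each result (after an `isFull()` and already-visited check) so the stack keeps feeding itself.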