Java爬虫抓取世界国旗-CSDN博客

本文借鉴了博客：https://blog.csdn.net/jclian91/article/details/80950334 ，下面直接看源码：

package com.getcount;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

/**
 * Crawler helpers that look up a country on country.911cha.com and download
 * the flag image shown on that country's page.
 *
 * <p>Typical flow: {@link #readyFileByLines(String)} loads the country names,
 * {@link #doPost(String)} resolves each name to a page path, and
 * {@link #getConnection(String)} downloads the flag found on that page.
 *
 * @author lhz
 */
public class WordNationalFlag {

    /** Root of the crawled site; ends with a slash so relative paths resolve against it. */
    private static final String BASE_URL = "http://country.911cha.com/";

    /** Sentinel returned by {@link #doPost(String)} whenever no page path could be resolved. */
    private static final String PAGE_FAILED = "Get page failed!";

    /** Browser-like User-Agent sent with the search request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36";

    /** Directory the downloaded flag images are written to. */
    private static final String FLAG_DIR = "C://Users/admin/Desktop/文档/世界国旗集/";

    /**
     * Sends an HTTP POST search request for the given country and returns the
     * link of the first search result.
     *
     * @param country country name to search for
     * @return the {@code href} of the first link inside the second
     *         {@code div.mcon} element of the response, or the string
     *         {@code "Get page failed!"} when the name is blank, the response
     *         status is not 200, the expected elements are missing, or any
     *         exception occurs
     */
    public static String doPost(String country) {
        if (country == null || country.trim().isEmpty()) {
            // Nothing to search for; skip the network round trip.
            return PAGE_FAILED;
        }

        HttpURLConnection conn = null;
        try {
            conn = (HttpURLConnection) new URL(BASE_URL).openConnection();
            conn.setUseCaches(false);
            conn.setRequestMethod("POST");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
            conn.setDoOutput(true);

            // A form-urlencoded body must be percent-encoded; raw non-ASCII names
            // or characters such as '&' would otherwise corrupt the request.
            String postParams = "q=" + URLEncoder.encode(country.trim(), "UTF-8");
            OutputStreamWriter osw = new OutputStreamWriter(conn.getOutputStream(), "UTF-8");
            try {
                osw.write(postParams);
                osw.flush();
            } finally {
                osw.close();
            }

            if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return PAGE_FAILED;
            }

            InputStream in = conn.getInputStream();
            try {
                Document doc = Jsoup.parse(in, "utf-8", BASE_URL);
                return extractFirstResultLink(doc);
            } finally {
                in.close();
            }
        } catch (Exception e) {
            System.err.println("doPost failed for '" + country + "': " + e);
            return PAGE_FAILED;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Walks second {@code div.mcon} -> first {@code ul} -> first {@code li} ->
     * first {@code a} and returns that anchor's {@code href}, or the failure
     * sentinel when any step of the path is missing.
     */
    private static String extractFirstResultLink(Document doc) {
        List<Element> containers = doc.select("div.mcon");
        if (containers.size() < 2) {
            return PAGE_FAILED;
        }
        Element node = containers.get(1);
        String[] path = {"ul", "li", "a"};
        for (String selector : path) {
            node = node.selectFirst(selector);
            if (node == null) {
                return PAGE_FAILED;
            }
        }
        return node.attr("href");
    }

    /**
     * Downloads the flag image found on the given country page.
     *
     * <p>The page path is resolved against the site root, the first
     * {@code img} element of the page is taken as the flag, and the image is
     * saved as {@code <name>.gif} in the flag directory, where {@code <name>}
     * is the image's {@code alt} text with "国旗" removed. Failures are
     * reported on standard error and never thrown.
     *
     * @param page page path as returned by {@link #doPost(String)}
     */
    public static void getConnection(String page) {
        if (page == null || page.isEmpty()) {
            System.err.println("getConnection skipped: empty page path");
            return;
        }
        try {
            // Resolve instead of concatenating so that "x.html" and "/x.html"
            // both yield a well-formed URL without a doubled slash.
            URL pageUrl = new URL(new URL(BASE_URL), page);
            URLConnection conn = pageUrl.openConnection();

            Document doc;
            InputStream in = conn.getInputStream();
            try {
                doc = Jsoup.parse(in, "utf-8", pageUrl.toString());
            } finally {
                in.close();
            }

            Element image = doc.selectFirst("img");
            if (image == null) {
                System.err.println("No <img> element found on " + pageUrl);
                return;
            }
            String flagSrc = image.attr("src");
            if (flagSrc.isEmpty()) {
                System.err.println("Flag image without src on " + pageUrl);
                return;
            }
            String flagName = image.attr("alt").replace("国旗", "");

            // The src is resolved relative to the page it was found on.
            URL flagUrl = new URL(pageUrl, flagSrc);
            FileUtils.copyURLToFile(flagUrl, new File(FLAG_DIR + flagName + ".gif"));
            System.out.println(flagName + "国旗下载成功");
        } catch (Exception e) {
            System.err.println("Flag download failed for page '" + page + "': " + e);
        }
    }

    /**
     * Reads a GBK-encoded text file line by line and returns the country
     * names it contains.
     *
     * <p>Each line is trimmed; blank lines are skipped so they do not turn
     * into empty search requests later. Read errors are reported on standard
     * error and the lines read so far are returned.
     *
     * @param fileName path of the file to read
     * @return the non-blank, trimmed lines in file order; never {@code null}
     */
    public static List<String> readyFileByLines(String fileName) {
        File file = new File(fileName);
        List<String> countries = new ArrayList<String>();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "GBK"));
            String line;
            while ((line = br.readLine()) != null) {
                String name = line.trim();
                if (!name.isEmpty()) {
                    countries.add(name);
                }
            }
        } catch (Exception e) {
            System.err.println("Failed to read '" + fileName + "': " + e);
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    System.err.println("Failed to close '" + fileName + "': " + e);
                }
            }
        }
        return countries;
    }

}

测试类：

说明：世界国家集.txt 中的内容是来自于 http://country.911cha.com/ 网站的国家名称；

如图：

package com.getcount;

import java.util.List;

/**
 * Driver that downloads the flag of every country listed in a text file by
 * delegating to {@code WordNationalFlag}.
 *
 * @author lhz
 */
public class TestWordNationalFlag {

    /**
     * Loads the country names from {@code src/世界国家集.txt}, resolves each
     * one to a page via {@code WordNationalFlag.doPost}, and hands every
     * result that contains "html" to {@code WordNationalFlag.getConnection}.
     * Prints a completion message once all names have been processed.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<String> countries = WordNationalFlag.readyFileByLines("src/世界国家集.txt");

        for (int i = 0; i < countries.size(); i++) {
            String detailPage = WordNationalFlag.doPost(countries.get(i));
            // Results without "html" are not treated as page links.
            if (!detailPage.contains("html")) {
                continue;
            }
            WordNationalFlag.getConnection(detailPage);
        }

        System.out.println("国旗下载完毕");
    }

}