爬取网站中空气质量数据:
1、定时任务
import cn.util.URLFecter;
import org.apache.http.client.HttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.stereotype.Service;
import java.util.Date;
/**
* 环境空气质量预报
*/
@Scheduled(cron = "0 45 15 * * ?")
private void getAirQualityForecast() {
System.out.println("开始执行定时服务(环境空气质量预报)!");
try {
HttpClient client = HttpClientBuilder.create().build();
String url = "http://106.37.208.228:8082/Home/Default?_=" + new Date().getTime();//1565404428085
//抓取的数据
String html = URLFecter.URLParser(client, url);
AirParse.getAirQualityForecastSubData(html));
} catch (Exception e) {
e.printStackTrace();
}
}
2、URLFecter工具类
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.util.EntityUtils;
/**
 * Helper that retrieves the textual body of a URL.
 */
public class URLFecter {

    /**
     * Requests {@code url} with the given client and returns the response body.
     *
     * @param client HTTP client handed to {@code HTTPUtils.getRawHtml}
     * @param url    address to request
     * @return the response body decoded as UTF-8 when the status code is 200;
     *         an empty string for any other status code
     * @throws Exception if the request or reading the body fails
     */
    public static String URLParser(HttpClient client, String url) throws Exception {
        // The request itself is delegated to the project's HTTPUtils helper.
        final HttpResponse response = HTTPUtils.getRawHtml(client, url);
        final boolean succeeded = response.getStatusLine().getStatusCode() == 200;

        if (!succeeded) {
            // Nothing useful to return: drain the entity and report an empty body.
            EntityUtils.consume(response.getEntity());
            return "";
        }

        // Status 200: the entity holds the HTML (or JSON) payload.
        return EntityUtils.toString(response.getEntity(), "utf-8");
    }
}
3、AirParse工具类
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Extracts the forecast records embedded in a script element of the page.
 *
 * <p>The method takes the script element at index 12, splits its content on
 * the text {@code var}, takes the fourth piece, strips line breaks and tabs,
 * and parses the span from just after the first {@code [} through the first
 * {@code ]} as a JSON array.
 *
 * @param html page markup to parse; may be {@code null} or empty
 * @return the parsed records, or an empty list when the expected script,
 *         declaration or array is missing or parsing fails; never {@code null}
 */
public static List<实体> getAirQualityForecastSubData(String html) {
    // Result list; stays empty whenever the page does not have the expected shape.
    List<实体> data = new ArrayList<>();
    if (html == null || html.isEmpty()) {
        return data;
    }
    try {
        // Parse the markup with jsoup.
        Document doc = Jsoup.parse(html);
        Elements scripts = doc.getElementsByTag("script").eq(12);
        // eq() yields an empty selection (never null) when the index is out of
        // range, so emptiness is the check that actually guards get(0).
        if (scripts.isEmpty()) {
            return data;
        }
        String[] pieces = scripts.get(0).data().split("var");
        if (pieces.length <= 3) {
            return data;
        }
        String cityData = pieces[3].replace("\n", "").replace("\t", "").replace("\r", "");
        int open = cityData.indexOf("[");
        int close = cityData.indexOf("]");
        if (open < 0 || close < open) {
            return data;
        }
        // Skips the first '[' but keeps the first ']' — presumably the payload
        // is a nested array and this selects its first inner array; confirm.
        cityData = cityData.substring(open + 1, close + 1);
        List<实体> parsed = JSONObject.parseArray(cityData, 实体.class);
        if (parsed != null) {
            data = parsed;
        }
    } catch (Exception e) {
        // Malformed content: report it and fall through with the current list.
        e.printStackTrace();
    }
    return data;
}
本文介绍了一个使用Java实现的定时任务,用于从指定网站抓取空气质量预报数据。通过Spring框架的Scheduled注解配置定时任务,利用Apache HttpClient进行网络请求,以及自定义的URLFecter和AirParse工具类解析网页数据。
5656

被折叠的 条评论
为什么被折叠?



