package com.gxzhuangxing.seed.modules.release.service;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;
import javax.servlet.ServletContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Scheduler;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.context.ContextLoader;
import org.springframework.web.context.WebApplicationContext;
import com.gxzhuangxing.seed.modules.release.entity.Release;
/**
 * Crawls the article listing at {@code http://www.gxny.gov.cn/news/ncpzlyaq/},
 * extracts every linked article (title, HTML body, embedded images), downloads
 * the images below the web application's {@code upload/release/} directory and
 * assembles a {@link Release} for each article.
 *
 * <p>{@link #saveSpider()} is the entry point; it is guarded so that a second
 * invocation returns immediately while a crawl is still running.
 */
public class Test {

    /** Listing page the crawl starts from; article links are appended to it. */
    private static final String LIST_URL = "http://www.gxny.gov.cn/news/ncpzlyaq/";

    /** Timeout (ms) handed to jsoup for every page fetch. */
    private static final int JSOUP_TIMEOUT_MS = 1800000;

    /** Connect/read timeout (ms) for plain {@link HttpURLConnection} requests. */
    private static final int HTTP_TIMEOUT_MS = 180 * 1000;

    /** Base path written into article HTML in place of the original image location. */
    private static final String CONTENT_IMAGE_BASE = "${ctxStatic}/../upload/release/";

    /** Bounds (ms, inclusive) of the random pause between article requests. */
    private static final int MIN_SLEEP_MS = 100;
    private static final int MAX_SLEEP_MS = 1000;

    // Re-entrancy guard for saveSpider(): 0 = idle, 1 = a crawl is running.
    private int releaseLock = 0;

    // Quartz scheduler; only stored and exposed through the accessors below.
    private Scheduler scheduler;

    // Shared source for the random pause length.
    private final Random random = new Random();

    /**
     * Runs one complete crawl: validates the listing URL, walks every listing
     * page and processes each linked article.
     *
     * <p>Returns immediately when another crawl is in progress. The guard is
     * always released when the crawl ends, including when it ends with an
     * exception, so a failed run cannot block all later runs.
     */
    public void saveSpider() {
        System.out.println("----------------------------------------------");
        if (!tryLock()) {
            return;
        }
        try {
            if (!this.getOneHtml(LIST_URL)) {
                return;
            }
            crawlListing(LIST_URL);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            unlock();
        }
    }

    /** Atomically marks a crawl as running; returns false when one already is. */
    private synchronized boolean tryLock() {
        if (releaseLock != 0) {
            return false;
        }
        releaseLock = 1;
        return true;
    }

    /** Marks the crawl as finished. */
    private synchronized void unlock() {
        releaseLock = 0;
    }

    /**
     * Reads the pager script of the first listing page and crawls every page.
     *
     * @param listUrl URL of the first listing page
     * @throws IOException when the first listing page cannot be fetched
     */
    private void crawlListing(String listUrl) throws IOException {
        Document firstPage = Jsoup.connect(listUrl).timeout(JSOUP_TIMEOUT_MS).get();
        Element firstListing = firstPage.getElementsByClass("subright1concon").get(0);

        // The first <script> inside the listing is a pager call; its arguments are
        // read positionally: [0] page count, [2] page file name, [3] file extension.
        String pagerCall = firstListing.getElementsByTag("script").get(0).toString();
        pagerCall = pagerCall.substring(pagerCall.indexOf("(") + 1, pagerCall.lastIndexOf(")"));
        String[] pagerArgs = pagerCall.split(",");
        int pageCount = Integer.parseInt(pagerArgs[0].trim());

        for (int i = 0; i < pageCount; i++) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            try {
                Element listing;
                if (i == 0) {
                    // Page 0 is the listing URL itself and has already been parsed.
                    listing = firstListing;
                } else {
                    String pageUrl = listUrl + pagerArgs[2].replace("\"", "").trim()
                            + "_" + i + "." + pagerArgs[3].replace("\"", "").trim();
                    if (!this.getOneHtml(pageUrl)) {
                        continue;
                    }
                    listing = Jsoup.connect(pageUrl).timeout(JSOUP_TIMEOUT_MS).get()
                            .getElementsByClass("subright1concon").get(0);
                }
                crawlPage(listing, listUrl);
            } catch (Exception e) {
                // Best effort: a broken page must not stop the remaining pages.
                System.out.println("获取本‘页’数据,出现异常:" + e);
            }
        }
    }

    /** Processes every article link found in one listing element. */
    private void crawlPage(Element listing, String baseUrl) {
        for (Element link : listing.getElementsByTag("a")) {
            if (Thread.currentThread().isInterrupted()) {
                return;
            }
            try {
                crawlArticle(link, baseUrl);
            } catch (Exception e) {
                // Best effort: a broken article must not stop the remaining articles.
                System.out.println("获取本‘条’数据,出现异常:" + e);
            }
        }
    }

    /**
     * Fetches one article, rewrites its image references and hands the result to
     * {@link #saveSpider(String, String, String, String)}.
     */
    private void crawlArticle(Element link, String baseUrl) throws IOException {
        String url = baseUrl + link.attr("href");
        String title = link.attr("title");

        // Pause so the remote site is not hit too frequently.
        this.threadSleep();
        if (Thread.currentThread().isInterrupted()) {
            return;
        }

        // NOTE(review): a duplicate check against the database (uriTitle(url, title))
        // was disabled in the original code and is still not performed here.
        System.out.println(title);

        // Fetch the article once; the body lives in .TRS_Editor or, failing that, #mcontent.
        Document article = Jsoup.connect(url).timeout(JSOUP_TIMEOUT_MS).get();
        Elements editors = article.getElementsByClass("TRS_Editor");
        Element body = editors.size() == 0 ? article.getElementById("mcontent") : editors.get(0);
        if (body == null) {
            System.out.println("获取本‘条’数据,出现异常:" + "no TRS_Editor or mcontent element in " + url);
            return;
        }

        String content = body.toString();
        StringBuilder imageUrls = new StringBuilder();
        String articleDir = url.substring(0, url.lastIndexOf("/"));
        // Only the path text is needed here, so no directory is created for it.
        String contentImageDir = datedPath(CONTENT_IMAGE_BASE);

        for (Element image : body.getElementsByTag("img")) {
            String src = image.attr("src");
            if (src == null || src.trim().equals("") || src.trim().equals("#")) {
                continue;
            }
            // Drops the first character of src and appends the rest to the article's
            // directory (presumably src looks like "./name.jpg" — confirm against the site).
            String imageUrl = articleDir + src.substring(1);
            // Skip images whose URL does not answer with HTTP 200, leaving the
            // original src in the content untouched.
            if (!this.getOneHtml(imageUrl)) {
                continue;
            }
            imageUrls.append(imageUrl).append(",");
            String fileName = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
            // Point the article HTML at the server-side copy of the image.
            content = content.replace(src, contentImageDir + fileName);
        }

        this.saveSpider(url, title, content, imageUrls.toString());
    }

    /** @return the stored Quartz scheduler, possibly {@code null} */
    public Scheduler getScheduler() {
        return scheduler;
    }

    /** @param scheduler the Quartz scheduler to store */
    public void setScheduler(Scheduler scheduler) {
        this.scheduler = scheduler;
    }

    /**
     * Downloads the image at {@code imageUrl} into the directory {@code filePath},
     * keeping the last path segment of the URL as file name. The directory is
     * created when missing. Failures are logged, not thrown.
     *
     * @param imageUrl absolute URL of the image
     * @param filePath target directory, expected to end with a path separator
     */
    @Transactional(readOnly = false)
    public void inputImages(String imageUrl, String filePath) {
        HttpURLConnection con = null;
        InputStream is = null;
        OutputStream os = null;
        try {
            String fileName = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
            String fileUrl = filePath + fileName;
            URL url = new URL(imageUrl);
            System.out.println(imageUrl);
            System.out.println(fileUrl);

            con = (HttpURLConnection) url.openConnection();
            // Bound both phases so a stalled server cannot block the crawl forever.
            con.setConnectTimeout(HTTP_TIMEOUT_MS);
            con.setReadTimeout(HTTP_TIMEOUT_MS);
            is = con.getInputStream();

            File dir = new File(filePath);
            if (!dir.exists()) {
                dir.mkdirs();
            }
            os = new FileOutputStream(fileUrl);

            // Copy through a 1 KB buffer.
            byte[] buffer = new byte[1024];
            int len;
            while ((len = is.read(buffer)) != -1) {
                os.write(buffer, 0, len);
            }
        } catch (MalformedURLException e) {
            System.out.println("不是正确的URL" + e);
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("获取不到URL,出现异常:" + e);
            e.printStackTrace();
        } finally {
            // Release everything even when the download failed half-way.
            if (os != null) {
                try {
                    os.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /**
     * Returns {@code url} extended by today's date ({@code yyyyMMdd}) and a
     * trailing slash, creating that directory (and any missing parents) on disk.
     *
     * @param url base directory, expected to end with a path separator
     * @return the per-day directory path
     */
    @Transactional(readOnly = false)
    public String savePath(String url) {
        String filePath = datedPath(url);
        File dir = new File(filePath);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        return filePath;
    }

    /** Builds {@code base + yyyyMMdd + "/"} without touching the file system. */
    private String datedPath(String base) {
        return base + new SimpleDateFormat("yyyyMMdd").format(new Date()) + "/";
    }

    /**
     * Pauses the current thread for a random 100-1000 ms so the remote site is
     * not hit too frequently. An interrupt ends the pause early and is kept
     * visible to the caller through the thread's interrupt flag.
     */
    public void threadSleep() {
        int delayMs = MIN_SLEEP_MS + random.nextInt(MAX_SLEEP_MS - MIN_SLEEP_MS + 1);
        try {
            Thread.sleep(delayMs);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Checks whether {@code oneHtmlURL} answers with HTTP 200.
     *
     * @param oneHtmlURL URL to probe
     * @return {@code true} only for response code 200; {@code false} for any
     *         other code or any exception (both are logged)
     */
    @Transactional(readOnly = false)
    public boolean getOneHtml(String oneHtmlURL) {
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(oneHtmlURL).openConnection();
            connection.setConnectTimeout(HTTP_TIMEOUT_MS);
            connection.setReadTimeout(HTTP_TIMEOUT_MS);
            int responseCode = connection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK) {
                return true;
            }
            System.out.println("获取不到网页的源码,服务器响应代码为:" + responseCode);
            return false;
        } catch (Exception e) {
            System.out.println("获取不到网页的源码,出现异常:" + e);
            return false;
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Downloads the article's images and assembles the {@link Release}.
     *
     * @param url       article URL
     * @param title     article title
     * @param content   article HTML
     * @param imagesURL comma-separated image URLs found in the article; may be
     *                  {@code null} or blank
     * @throws IllegalStateException when images must be stored but the upload
     *                               directory cannot be resolved
     */
    @Transactional(readOnly = false)
    public void saveSpider(String url, String title, String content, String imagesURL) {
        Release release = new Release();
        StringBuilder fileUrl = new StringBuilder();

        if (imagesURL != null && !imagesURL.trim().equals("")) {
            String uploadDir = null;
            for (String imageUrl : imagesURL.split(",")) {
                if (imageUrl == null || imageUrl.trim().equals("")) {
                    continue;
                }
                // Resolve (and create) the upload directory only once an image needs it.
                if (uploadDir == null) {
                    uploadDir = this.savePath(uploadRoot());
                }
                String fileName = imageUrl.substring(imageUrl.lastIndexOf("/") + 1);
                fileUrl.append(uploadDir).append(fileName).append(",");
                this.inputImages(imageUrl, uploadDir);
            }
        }

        release.setCopyfrom(url);
        release.setTitle(title);
        release.setContent(content);
        release.setImagesUrl(fileUrl.toString());
        release.setInputImagesUrl(imagesURL == null ? "" : imagesURL);
        // NOTE(review): persistence (releaseDao.save(release)) was disabled in the
        // original code; the populated Release is currently discarded.
    }

    /** Resolves {@code <webapp root>/upload/release/} with forward slashes. */
    private String uploadRoot() {
        WebApplicationContext context = ContextLoader.getCurrentWebApplicationContext();
        if (context == null) {
            throw new IllegalStateException(
                    "No current WebApplicationContext; cannot resolve the upload directory");
        }
        ServletContext servletContext = context.getServletContext();
        String root = servletContext.getRealPath("/");
        if (root == null) {
            throw new IllegalStateException(
                    "ServletContext.getRealPath(\"/\") returned null; cannot resolve the upload directory");
        }
        root = root.replace("\\", "/");
        if (!root.endsWith("/")) {
            root += "/";
        }
        return root + "upload/release/";
    }

    /** Runs a single crawl from the command line. */
    public static void main(String[] arg) {
        Test ss = new Test();
        ss.saveSpider();
    }
}