购买:四叶天代理IP_Http/https动态IP_Socks5静态ip--四叶天官网
拿取url:
代码部分:
package com.qcby.qz.util; import com.alibaba.fastjson2.JSON; import com.alibaba.fastjson2.JSONArray; import com.alibaba.fastjson2.JSONObject; import com.qcby.qz.Thread.GuoZiThread; import com.qcby.qz.Thread.JiuYeThread; import com.qcby.qz.entity.AgencyIp; import com.qcby.qz.entity.JobJY; import org.apache.http.HttpHost; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import org.apache.http.util.EntityUtils; import org.springframework.scheduling.annotation.Scheduled; import javax.annotation.PostConstruct; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.HttpURLConnection; import java.net.InetSocketAddress; import java.net.Proxy; import java.net.URL; import java.util.*; import java.util.concurrent.CopyOnWriteArraySet; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; public class IpPoolUtil { //IP池 private static Set<AgencyIp> ipSet = new CopyOnWriteArraySet<>(); static ExecutorService pool = null; @PostConstruct //被注解的方法,在对象加载完依赖注入后执行。 public static void init(){ CustomThreadPoolExecutor exec = new CustomThreadPoolExecutor(); // 1.初始化 exec.init(); pool = exec.getCustomThreadPoolExecutor(); } // @Scheduled(fixedRate=5000) // public static void scheduleIpPool(){ // try { // getIpList(); // } catch (IOException e) { // e.printStackTrace(); // } //// final int count = ipSet.size(); //// log.info("IP代理池数量为:" + count); //// if (count>500){ //// List<AgencyIp> list = new ArrayList<>(ipSet); //// Collections.sort(list); //// log.info("集合数量:" + list.size()); //// for(int i = 1; i <= 100; i++){ //// list.remove(count - i); //// } //// ipSet.clear(); //// ipSet.addAll(list); //// } // } 
//如果一次性拿取多个IP就会出现使用一个IP,其他IP不能继续使用的情况 private static void getIpList() throws IOException { System.out.println("正在抓取IP......"); String apiUrl = "http://proxy.siyetian.com/apis_get.html?token=AesJWLORVS51ERJdXTqFFeNRVS04kajlXTB1STqFUeORUR41kanBTTqVVNNp3a59EVNlnTENGM.QM0MjMwkjMzcTM&limit=1&type=0&time=&data_format=json"; String resultJsonStr = getData(apiUrl); JSONObject jsonObject = JSON.parseObject(resultJsonStr); JSONArray data = (JSONArray) jsonObject.get("data"); final AgencyIp agencyIp = new AgencyIp(); // //测试是否将失效的ip移除 // AgencyIp test = new AgencyIp(); // test.setAddress("121.232.76.145"); // test.setPort(13657); // ipSet.add(test); for (int i = 0;i<data.size();i++){ agencyIp.setAddress(data.getJSONObject(i).get("ip").toString()); agencyIp.setPort((int)data.getJSONObject(i).get("port")); //开启子线程查看该IP是否可用(这里可以选用线程池) pool.execute(new Runnable() { @Override public void run() { if (checkIpAddress(agencyIp)){ ipSet.add(agencyIp); } } }); pool.execute(new Runnable() { @Override public void run() { for (AgencyIp agencyIp : ipSet) { if (!checkIpAddress(agencyIp)) { // 如果IP失效,立即获取新的IP try { ipSet.remove(agencyIp); System.out.println("ip失效"); getIpList(); } catch (IOException e) { e.printStackTrace(); } break; // 只获取一个新的IP } } } }); } } /** * 获取指定url内容 * @param requestUrl * @return * @throws IOException */ private static String getData(String requestUrl) throws IOException { URL url = new URL(requestUrl); HttpURLConnection conn = (HttpURLConnection) url.openConnection(); conn.setRequestMethod("GET"); conn.connect(); BufferedReader reader = new BufferedReader(new InputStreamReader( conn.getInputStream(), "utf-8")); StringBuffer buffer = new StringBuffer(); String str = ""; while ((str = reader.readLine()) != null) { buffer.append(str); } if (buffer.length()==0){ buffer.append("[]"); } String result = buffer.toString(); reader.close(); System.gc(); conn.disconnect(); return result; } /** * 检查IP地址是否有效 * @param agencyIp * @return */ private static boolean 
checkIpAddress(AgencyIp agencyIp) { // 使用java.net.Proxy类设置代理IP Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(agencyIp.getAddress(), agencyIp.getPort())); HttpURLConnection connection = null; try { connection = (HttpURLConnection)new URL("http://www.baidu.com/").openConnection(proxy); } catch (IOException ex) { ex.printStackTrace(); } connection.setConnectTimeout(3000); connection.setReadTimeout(3000); connection.setUseCaches(false); try { if(connection.getResponseCode() == 200) { System.out.println(agencyIp.getAddress()+"该IP有效"); return true; } } catch (IOException e) { System.out.println(agencyIp.getAddress()+"该IP失效"); deleteIp(agencyIp); //清除该失效的IP return false; } return false; } /** * //清除无效ip :因为ip是有稳定时长的,如果出现了问题应该去测试,不能用了就直接清除 * @param agencyIp * @return */ public static boolean deleteIp(AgencyIp agencyIp){ return ipSet.remove(agencyIp); } public static void main(String[] args) throws IOException, InterruptedException { init(); getIpList(); pool.shutdown(); // 禁止提交新任务 pool.awaitTermination(1, TimeUnit.MINUTES); // 等待线程池中的任务完成 AgencyIp[] ipArray = ipSet.toArray(new AgencyIp[0]); // 创建固定大小的线程池,这里假设我们只需要两个线程 ExecutorService executorService = Executors.newFixedThreadPool(10); // 提交GuoZiThread任务到线程池 executorService.submit(new GuoZiThread(ipArray[0].getAddress(), ipArray[0].getPort())); // 提交JiuYeThread任务到线程池 executorService.submit(new JiuYeThread()); //关闭线程池(不再接受新任务,但已提交的任务会继续执行) executorService.shutdown(); // GuoZiThread guoZiThread = new GuoZiThread(ipArray[0].getAddress(),ipArray[0].getPort()); // JiuYeThread jiuYeThread = new JiuYeThread(); // // Thread guoZi = new Thread(guoZiThread); // Thread jiuYe = new Thread(jiuYeThread); // // guoZi.start(); // jiuYe.start(); } }
package com.qcby.czspringbootdemo.util;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Holder for a {@link ThreadPoolExecutor} whose rejection policy blocks the submitting
 * thread until the work queue has room, instead of discarding or failing the task.
 *
 * <p>Typical use: call {@link #init()}, obtain the executor through
 * {@link #getCustomThreadPoolExecutor()}, submit tasks, and shut the executor down only
 * after the submitted tasks have had a chance to run.
 *
 * @author wangzhen
 */
public class CustomThreadPoolExecutor {

    private ThreadPoolExecutor pool = null;

    /**
     * Creates the underlying pool with the following settings:
     * <ul>
     *   <li>corePoolSize: 10</li>
     *   <li>maximumPoolSize: 30</li>
     *   <li>keepAliveTime: 30 {@link TimeUnit#MINUTES} for idle threads above the core size</li>
     *   <li>workQueue: {@code ArrayBlockingQueue} with capacity 5</li>
     *   <li>threadFactory: {@link CustomThreadFactory}</li>
     *   <li>rejection handler: {@link CustomRejectedExecutionHandler}</li>
     * </ul>
     * With these values at most 35 tasks (30 running + 5 queued) are accepted directly;
     * the 36th concurrent submission is handed to the rejection handler.
     */
    public void init() {
        pool = new ThreadPoolExecutor(
                10,
                30,
                30,
                TimeUnit.MINUTES,
                new ArrayBlockingQueue<Runnable>(5),
                new CustomThreadFactory(),
                new CustomRejectedExecutionHandler());
    }

    /**
     * Shuts the pool down immediately via {@code shutdownNow()}; does nothing when
     * {@link #init()} has not been called. (The misspelled name is kept for existing callers.)
     */
    public void destory() {
        if (pool != null) {
            pool.shutdownNow();
        }
    }

    /**
     * @return the executor created by {@link #init()}, or {@code null} before initialisation
     */
    public ExecutorService getCustomThreadPoolExecutor() {
        return this.pool;
    }

    /** Names threads {@code CustomThreadPoolExecutor1}, {@code CustomThreadPoolExecutor2}, ... */
    private static class CustomThreadFactory implements ThreadFactory {
        private final AtomicInteger count = new AtomicInteger(0);

        @Override
        public Thread newThread(Runnable r) {
            Thread t = new Thread(r);
            t.setName(CustomThreadPoolExecutor.class.getSimpleName() + count.incrementAndGet());
            return t;
        }
    }

    /** Blocks the submitter until the queue has room (uses {@code put} rather than {@code offer}). */
    private static class CustomRejectedExecutionHandler implements RejectedExecutionHandler {
        @Override
        public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
            try {
                executor.getQueue().put(r);
            } catch (InterruptedException e) {
                // The task was NOT queued. Restore the interrupt flag and tell the
                // submitter, instead of logging and silently losing the task.
                Thread.currentThread().interrupt();
                throw new RejectedExecutionException(
                        "Interrupted while waiting for queue space; task was not queued", e);
            }
        }
    }
}
package com.qcby.czspringbootdemo.entity;
public class AgencyIp {
private String address;
private int port;
public AgencyIp() {
}
public AgencyIp(String address, int port) {
this.address = address;
this.port = port;
}
public String getAddress() {
return address;
}
public void setAddress(String address) {
this.address = address;
}
public int getPort() {
return port;
}
public void setPort(int port) {
this.port = port;
}
@Override
public String toString() {
return "AgencyIp{" +
"address='" + address + '\'' +
", port=" + port +
'}';
}
}
1.将上面获取的url放到图中所示位置。
代码中比较重要的方法:
getIpList(): 作用:将获取的ip存入ipSet里面。 该方法里面的 pool.execute(new Runnable() { @Override public void run() { if (checkIpAddress(agencyIp)){ ipSet.add(agencyIp); } } }); 作用是向线程池提交一个任务,该任务会通过checkIpAddress(AgencyIp agencyIp)方法检测这个IP是否可用,只有可用时才将其存入ipSet里面。 在此代码的基础上,额外向线程池提交一个任务,如下: pool.execute(new Runnable() { @Override public void run() { for (AgencyIp agencyIp : ipSet) { if (!checkIpAddress(agencyIp)) { // 如果IP失效,立即获取新的IP try { ipSet.remove(agencyIp); System.out.println("ip失效"); getIpList(); } catch (IOException e) { e.printStackTrace(); } break; // 只获取一个新的IP } } } }); 该任务的作用是,遍历一次ipSet,一旦发现ipSet里面存在失效ip,就将这个ip移除,并且重新调用getIpList()方法获取一个新的ip。
getData(String requestUrl):根据四叶天网站url获得对应的ip。
checkIpAddress(AgencyIp agencyIp):使用ip访问百度的网址,如果访问成功,证明ip有效,存入ipSet,否则不存入。
运行流程:
// Entry point (excerpt): initialise the pool, fetch/validate proxies, then wait for the pool to drain.
public static void main(String[] args) throws IOException, InterruptedException {
    init();
    getIpList();
    pool.shutdown(); // stop accepting new tasks
    pool.awaitTermination(1, TimeUnit.MINUTES); // wait for the tasks in the thread pool to finish
}
在上面步骤中,代码会通过getIpList();将有用的ip存入到ipSet里面。
注意,这里需要加入 pool.shutdown(); 和 pool.awaitTermination(1, TimeUnit.MINUTES); 这两行代码,先手动关闭线程池并等待其中的任务执行完毕;否则main方法不会等待getIpList()方法提交到线程池中的校验任务结束,就会直接读取ipSet,这样会导致在main方法中获取到的ipSet为空。
AgencyIp[] ipArray = ipSet.toArray(new AgencyIp[0]);
将ip存入到ipSet里面后,将ipSet转为数组类型。
// 创建固定大小的线程池,这里假设我们只需要两个线程 ExecutorService executorService = Executors.newFixedThreadPool(10); // 提交GuoZiThread任务到线程池 executorService.submit(new GuoZiThread(ipArray[0].getAddress(), ipArray[0].getPort())); // 提交JiuYeThread任务到线程池 executorService.submit(new JiuYeThread()); //关闭线程池(不再接受新任务,但已提交的任务会继续执行) executorService.shutdown();
依次将数组中的ip通过线程的方式放入写好的爬虫代码