IP代理池

购买:四叶天代理IP_Http/https动态IP_Socks5静态ip--四叶天官网

拿取url:

代码部分:

package com.qcby.qz.util;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONArray;
import com.alibaba.fastjson2.JSONObject;
import com.qcby.qz.Thread.GuoZiThread;
import com.qcby.qz.Thread.JiuYeThread;
import com.qcby.qz.entity.AgencyIp;
import com.qcby.qz.entity.JobJY;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.springframework.scheduling.annotation.Scheduled;

import javax.annotation.PostConstruct;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.*;
import java.util.concurrent.CopyOnWriteArraySet;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

/**
 * Keeps an in-memory pool of HTTP proxy addresses.
 *
 * <p>Proxies are fetched from the vendor API ({@link #PROXY_API_URL}), probed on a
 * worker pool by sending a request through them, and stored in {@code ipSet} when
 * the probe answers with HTTP 200. {@link #main(String[])} shows the intended
 * flow: fill the pool, wait for the probes, then hand the first proxy to the
 * crawler threads.
 */
public class IpPoolUtil {
    /**
     * Vendor endpoint; {@code limit=1} asks for a single proxy per call.
     * NOTE(review): the access token is embedded in source — consider moving it
     * to configuration before publishing or committing this file.
     */
    private static final String PROXY_API_URL = "http://proxy.siyetian.com/apis_get.html?token=AesJWLORVS51ERJdXTqFFeNRVS04kajlXTB1STqFUeORUR41kanBTTqVVNNp3a59EVNlnTENGM.QM0MjMwkjMzcTM&limit=1&type=0&time=&data_format=json";

    /** Page requested through a proxy to decide whether the proxy works. */
    private static final String PROBE_URL = "http://www.baidu.com/";

    /** Connect and read timeout, in milliseconds, for a single probe. */
    private static final int PROBE_TIMEOUT_MILLIS = 3000;

    /** The proxy pool; written from worker threads, read from {@code main}. */
    private static final Set<AgencyIp> ipSet = new CopyOnWriteArraySet<>();

    /** Worker pool used for probing; {@code null} until {@link #init()} has run. */
    static ExecutorService pool = null;

    /**
     * Creates the worker pool. Must be called before {@code getIpList()}.
     *
     * <p>NOTE(review): {@code @PostConstruct} is specified for non-static methods of
     * container-managed objects; nothing visible here registers this class as a
     * bean, which is why {@code main} calls this method explicitly.
     */
    @PostConstruct
    public static void init(){
        CustomThreadPoolExecutor exec = new CustomThreadPoolExecutor();
        exec.init();
        pool = exec.getCustomThreadPoolExecutor();
    }

    /**
     * Fetches proxies from the vendor API and schedules two tasks per proxy:
     * one that probes the new proxy and stores it when usable, and one that
     * sweeps the pool for a dead proxy and fetches a replacement.
     *
     * <p>Requesting several proxies at once was observed by the original author
     * to leave all but one unusable, hence the API URL asks for one per call.
     *
     * @throws IOException if the vendor API cannot be read
     */
    private static void getIpList() throws IOException {
        System.out.println("正在抓取IP......");
        String resultJsonStr = getData(PROXY_API_URL);
        // getData yields "[]" for an empty body; only a JSON object can carry "data".
        if (!resultJsonStr.trim().startsWith("{")) {
            return;
        }
        JSONObject jsonObject = JSON.parseObject(resultJsonStr);
        JSONArray data = jsonObject == null ? null : jsonObject.getJSONArray("data");
        if (data == null) {
            return;
        }

        for (int i = 0; i < data.size(); i++) {
            JSONObject entry = data.getJSONObject(i);
            String address = entry == null ? null : entry.getString("ip");
            if (address == null) {
                continue;
            }
            // A fresh object per entry: the tasks below run asynchronously, so a
            // single shared instance would be overwritten by later iterations.
            final AgencyIp candidate = new AgencyIp();
            candidate.setAddress(address);
            candidate.setPort(entry.getIntValue("port"));

            pool.execute(() -> storeIfUsable(candidate));
            pool.execute(IpPoolUtil::replaceFirstDeadProxy);
        }
    }

    /** Probes {@code candidate} and adds it to the pool when the probe succeeds. */
    private static void storeIfUsable(AgencyIp candidate) {
        if (checkIpAddress(candidate)) {
            ipSet.add(candidate);
        }
    }

    /**
     * Walks the pool, and on the first proxy that fails its probe removes it and
     * fetches one replacement; stops after that single replacement.
     */
    private static void replaceFirstDeadProxy() {
        for (AgencyIp pooled : ipSet) {
            if (!checkIpAddress(pooled)) {
                try {
                    ipSet.remove(pooled);
                    System.out.println("ip失效");
                    getIpList();
                } catch (IOException e) {
                    e.printStackTrace();
                }
                break; // only one replacement per sweep
            }
        }
    }

    /**
     * Performs an HTTP GET and returns the response body with line breaks removed.
     *
     * @param requestUrl URL to fetch
     * @return the body text, or {@code "[]"} when the body is empty
     * @throws IOException if the connection or the read fails
     */
    private static String getData(String requestUrl) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(requestUrl).openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.connect();
            StringBuilder body = new StringBuilder();
            // try-with-resources closes the stream even when readLine throws.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), java.nio.charset.StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    body.append(line);
                }
            }
            return body.length() == 0 ? "[]" : body.toString();
        } finally {
            conn.disconnect();
        }
    }


    /**
     * Probes a proxy by requesting {@link #PROBE_URL} through it.
     *
     * <p>Any failure — the connection cannot be opened, an I/O error occurs, or
     * the status is not 200 — counts as "unusable" and evicts the proxy from the
     * pool.
     *
     * @param agencyIp proxy address and port to probe
     * @return {@code true} only when the probe answered with HTTP 200
     */
    private static boolean checkIpAddress(AgencyIp agencyIp) {
        Proxy proxy = new Proxy(Proxy.Type.HTTP,
                new InetSocketAddress(agencyIp.getAddress(), agencyIp.getPort()));
        HttpURLConnection connection = null;
        try {
            connection = (HttpURLConnection) new URL(PROBE_URL).openConnection(proxy);
            connection.setConnectTimeout(PROBE_TIMEOUT_MILLIS);
            connection.setReadTimeout(PROBE_TIMEOUT_MILLIS);
            connection.setUseCaches(false);
            if (connection.getResponseCode() == 200) {
                System.out.println(agencyIp.getAddress()+"该IP有效");
                return true;
            }
        } catch (IOException ignored) {
            // An I/O failure through the proxy is exactly the "unusable" signal;
            // it is reported by the message below rather than a stack trace.
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        System.out.println(agencyIp.getAddress()+"该IP失效");
        deleteIp(agencyIp);
        return false;
    }

    /**
     * Removes a proxy from the pool. Proxies are only valid for a limited time,
     * so one that stops working is dropped rather than retried.
     *
     * @param agencyIp proxy to remove
     * @return {@code true} if the pool contained the proxy
     */
    public static boolean deleteIp(AgencyIp agencyIp){
        return ipSet.remove(agencyIp);
    }

    /**
     * Demo entry point: fills the pool, waits for the probe tasks, then starts
     * the crawler threads with the first usable proxy.
     *
     * @throws IOException if the vendor API cannot be read
     * @throws InterruptedException if interrupted while waiting for the probes
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        init();
        getIpList();
        // Stop accepting tasks and wait for the probes, otherwise ipSet is read
        // before any probe has had a chance to add to it.
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.MINUTES);

        AgencyIp[] ipArray = ipSet.toArray(new AgencyIp[0]);
        if (ipArray.length == 0) {
            // No proxy passed its probe; indexing ipArray[0] would throw.
            System.out.println("没有可用的代理IP");
            return;
        }

        // Fixed pool of 10 threads; only two tasks are submitted below.
        ExecutorService executorService = Executors.newFixedThreadPool(10);
        executorService.submit(new GuoZiThread(ipArray[0].getAddress(), ipArray[0].getPort()));
        executorService.submit(new JiuYeThread());

        // No new tasks are accepted; the two submitted tasks still run to completion.
        executorService.shutdown();
    }
}

package com.qcby.czspringbootdemo.util;

import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * @auther: wangzhen
 * @date: 19-8-20 11:33
 * @description:
 */

/**
 * Wraps a {@link ThreadPoolExecutor} configured with named threads and a
 * rejection policy that blocks the submitter instead of discarding the task.
 *
 * <p>Call {@link #init()} first; until then {@link #getCustomThreadPoolExecutor()}
 * returns {@code null}.
 */
public class CustomThreadPoolExecutor {

    private ThreadPoolExecutor pool = null;


    /**
     * Creates the underlying pool with:
     * <ul>
     *   <li>core pool size 10;</li>
     *   <li>maximum pool size 30;</li>
     *   <li>keep-alive of 30 minutes for idle threads above the core size;</li>
     *   <li>a bounded work queue holding 5 tasks;</li>
     *   <li>{@link CustomThreadFactory} for thread naming;</li>
     *   <li>{@link CustomRejectedExecutionHandler} for overflow.</li>
     * </ul>
     *
     * <p>With every worker busy, the first 10 tasks start core threads, the next
     * 5 wait in the queue and the next 20 start extra threads, so the handler is
     * first invoked for the 36th outstanding task (30 running + 5 queued).
     */
    public void init() {
        pool = new ThreadPoolExecutor(
                10,
                30,
                30,
                TimeUnit.MINUTES,
                new ArrayBlockingQueue<Runnable>(5),
                new CustomThreadFactory(),
                new CustomRejectedExecutionHandler());
    }

    /** Shuts the pool down immediately via {@code shutdownNow()}; no-op before {@link #init()}. */
    public void destory() {
        if(pool != null) {
            pool.shutdownNow();
        }
    }


    /**
     * @return the pool created by {@link #init()}, or {@code null} if it has not run
     */
    public ExecutorService getCustomThreadPoolExecutor() {
        return this.pool;
    }

    /** Names threads {@code CustomThreadPoolExecutor1}, {@code ...2}, and so on. */
    private static class CustomThreadFactory implements ThreadFactory {

        private final AtomicInteger count = new AtomicInteger(0);

        @Override
        public Thread newThread(Runnable r) {
            Thread t = new Thread(r);
            t.setName(CustomThreadPoolExecutor.class.getSimpleName() + count.incrementAndGet());
            return t;
        }
    }


    /**
     * Overflow policy: blocks the submitting thread on the queue's {@code put}
     * until there is room, rather than failing fast as {@code offer} would.
     *
     * <p>NOTE(review): there is no shutdown check, so a task enqueued after the
     * executor has been shut down may never be picked up — confirm callers do
     * not submit after shutdown.
     */
    private static class CustomRejectedExecutionHandler implements RejectedExecutionHandler {

        /**
         * @throws RejectedExecutionException if the submitting thread is
         *         interrupted before the task could be enqueued
         */
        @Override
        public void rejectedExecution(Runnable r, ThreadPoolExecutor executor) {
            try {
                executor.getQueue().put(r);
            } catch (InterruptedException e) {
                // Restore the flag for the caller and report that the task was
                // not enqueued, instead of dropping it silently.
                Thread.currentThread().interrupt();
                throw new RejectedExecutionException("Interrupted while waiting to enqueue task", e);
            }
        }
    }
}
package com.qcby.czspringbootdemo.entity;

public class AgencyIp {
    private String address;
    private int port;

    public AgencyIp() {

    }

    public AgencyIp(String address, int port) {
        this.address = address;
        this.port = port;
    }

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }

    @Override
    public String toString() {
        return "AgencyIp{" +
                "address='" + address + '\'' +
                ", port=" + port +
                '}';
    }

}

1.将上面获取的url放到图中所示位置。

代码中比较重要的方法:

getIpList(): 作用:从代理接口获取ip,并将校验通过的ip存入ipSet里面。 该方法里面的 pool.execute(new Runnable() { @Override public void run() { if (checkIpAddress(agencyIp)){ ipSet.add(agencyIp); } } }); 作用是向线程池提交一个任务:该任务通过checkIpAddress(AgencyIp agencyIp)方法校验刚获取的IP,校验通过后将其存入ipSet里面(每个任务只执行一次)。 在此代码的基础上,额外提交一个任务,如下: pool.execute(new Runnable() { @Override public void run() { for (AgencyIp agencyIp : ipSet) { if (!checkIpAddress(agencyIp)) { // 如果IP失效,立即获取新的IP try { ipSet.remove(agencyIp); System.out.println("ip失效"); getIpList(); } catch (IOException e) { e.printStackTrace(); } break; // 只获取一个新的IP } } } }); 该任务的作用是遍历ipSet,一旦发现ipSet里面存在失效ip,就将这个ip移除,并重新调用getIpList()方法获取一个新的ip。

getData(String requestUrl):请求四叶天网站提供的url,返回包含代理ip的JSON字符串。

checkIpAddress(AgencyIp agencyIp):使用该ip作为代理访问百度的网址,如果返回状态码200,证明ip有效,方法返回true(由调用方将其存入ipSet);如果访问过程中出现异常,则将该ip从ipSet中移除并返回false。

运行流程:

public static void main(String[] args) throws IOException, InterruptedException { init(); getIpList(); pool.shutdown(); // 禁止提交新任务 pool.awaitTermination(1, TimeUnit.MINUTES); // 等待线程池中的任务完成 }

在上面步骤中,代码会通过getIpList();将有用的ip存入到ipSet里面。

注意,这里需加入pool.shutdown(); pool.awaitTermination(1, TimeUnit.MINUTES); 两行代码来关闭线程池并等待其中的任务执行完毕,否则main方法不会等待getIpList()方法提交的校验任务完成,这样会导致在main方法中获取到的ipSet为空。

AgencyIp[] ipArray = ipSet.toArray(new AgencyIp[0]);

将ip存入到ipSet里面后,将ipSet转为数组类型。

// 创建固定大小的线程池,这里假设我们只需要两个线程 ExecutorService executorService = Executors.newFixedThreadPool(10); // 提交GuoZiThread任务到线程池 executorService.submit(new GuoZiThread(ipArray[0].getAddress(), ipArray[0].getPort())); // 提交JiuYeThread任务到线程池 executorService.submit(new JiuYeThread()); //关闭线程池(不再接受新任务,但已提交的任务会继续执行) executorService.shutdown();

将数组中的第一个ip(ipArray[0])传给写好的爬虫线程GuoZiThread,并通过线程池的方式执行爬虫代码。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值