#coding=utf-8
import MySQLdb
import os.path
try:
from sshtunnel import SSHTunnelForwarder
except ImportError:  # sshtunnel is optional; fall back gracefully when it is not installed
SSHTunnelForwarder = None
import traceback
import json
import requests
import Queue
import threading
from bs4 import BeautifulSoup
import sys
import time
import re
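# Python 2 default-encoding hack: the code below mixes str and unicode (page titles, SQL strings),
# so the default encoding is forced to utf8 to avoid UnicodeDecodeError on implicit conversions.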
reload(sys)
sys.setdefaultencoding('utf8')
class DbConnection(object):
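    """Lazily-opened MySQL connection; subclasses supply host/port/user/passwd/db as class attributes."""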
def __init__(self):
self.conn = None
def open(self):
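        """Open the MySQLdb connection on first use and reuse it on later calls."""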
child = type(self)
if self.conn is None:
self.conn = MySQLdb.connect(
host=child.host,
port=child.port,
user=child.user,
passwd=child.passwd,
db=child.db,
charset="utf8"
)
return self.conn
class DbSpider(DbConnection):
host = '127.0.0.1'
port = 3306
user = 'root'
passwd = '123456'
db = 'bi'
class DbBusiness(DbConnection):
host = '127.0.0.1'
port = 3306
user = 'root'
passwd = '123456'
db = 'carl'
def db_carl_connect():
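    """Return a connection built from DbSpider, i.e. the spider ('bi') database."""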
return DbSpider().open()
def db_business_connect():
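    """Return a connection to the business ('carl') database."""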
return DbBusiness().open()
def get_proxy():
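    """Fetch an HTTP proxy from the Kuaidaili API and return it once a one-second test request
    through it succeeds; return None after 20 failed attempts."""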
for i in range(20):
        print "get proxy attempt:", i + 1
        # Kuaidaili (kdlapi.com) proxy API
proxy_url = requests.get(
"http://svip.kdlapi.com/api/getproxy/?orderid=xxxxxxxxxx&num=1&protocol=1&method=2&an_ha=1&quality=2&sep=1"
).text
proxy = {"http": "http://%s" % proxy_url}
        headers = {
            'Accept-Encoding': 'gzip',
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
        try:
            # quick connectivity check through the candidate proxy before returning it
            requests.get("https://www.baidu.com/", headers=headers, proxies=proxy, timeout=1)
            time.sleep(0.5)
            return proxy
        except requests.RequestException:
            continue
return None
def get_weibo_official(taskQueue, lock, proxy):
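    """Worker loop: pull jobs from taskQueue, fetch each Weibo profile page through the shared
    proxy, parse follower/following/post counts out of the FM.view() script blocks, and store
    the result with insert_db under the shared lock. Visitor-wall pages are requeued up to twice."""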
    while True:
        try:
            job = taskQueue.get(block=False)  # non-blocking: empty() followed by get() can strand a worker
        except Queue.Empty:
            break
        print 'Analyzing:', threading.currentThread().name
        print taskQueue.qsize()
code = job[1]
url = job[2]
if url.find("https") == -1:
url = url.replace("http", "https")
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0',
'Cookie': 'SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9W5GFdE6.C7laCRkjeTxrHe45JpVF02Reo27ehqXeK.0; SUB=_2AkMpfdlUdcPxrAVRmfwQyG_gbY5H-jyaqLCiAn7uJhMyAxh77m1TqSVutBF-XIn0-PK-VwTV_U7zrWgqi1Z9I8-S;'
        }  # the Cookie header is a required parameter for this request
        suid_resp = requests.get(url, proxies=proxy, headers=headers, timeout=15)  # timeout so a dead proxy cannot hang the worker
html = suid_resp.text
soup = BeautifulSoup(html, "html.parser")
script = soup.find_all("script")
        title = str(soup.find_all("title")[0].get_text()).replace("的微博_微博", "")  # strip the "的微博_微博" title suffix, leaving the account name
if title == "Sina Visitor System":
            if job[3] < 2:  # retry limit: requeue at most twice, then give up
job[3] = job[3] + 1
taskQueue.put(job)
proxy = get_proxy()
print 'weiboOfficialHtml is null, code= %s' % (code)
continue
else:
continue
elif title == "414 Request-URI Too Large":
continue
regex = re.compile(u'[\u4E00-\u9FA5]+')
regex_str = re.compile('([a-zA-Z0-9])')
res = regex.findall(title.decode('utf-8'))
res_str = regex_str.findall(title)
        if not res and not res_str:  # findall() returns a list, never None; skip titles with no usable characters
continue
        fensi = 0  # follower count (粉丝)
        guanzhu = 0  # following count (关注)
        weibo = 0  # post count (微博)
for s in script:
if str(s).find("粉丝") != -1 and str(s).find("关注") != -1 and str(s).find("微博") != -1 and str(s).find("Pl_Core_T8CustomTriColumn__3") != -1:
                filtered = str(s).replace("\\t", "").replace("\\n", "").replace("\\r", "")
                dataHtml = json.loads(filtered[filtered.index("<script>FM.view(") + 16:filtered.index(")</script>")])["html"]
finalHtml = BeautifulSoup(dataHtml,"html.parser")
tds = finalHtml.find_all("td")
                weiboTd = ""
                fensiTd = ""
                guanzhuTd = ""
                for td in list(tds):  # iterate over a copy: tds is mutated inside the loop
if (str(td).find("微博") != -1) & (str(td).find("粉丝") == -1) & (str(td).find("关注") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
weiboTd = str(td)
weibo = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                for td in list(tds):  # copy again; matched items are removed below
if (str(td).find("粉丝") != -1) & (str(td).find("微博") != -1) & (str(td).find("关注") == -1):
fensiTd = str(td).replace(weiboTd, "")
tdHtml = BeautifulSoup(fensiTd, "html.parser")
fensi = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                    elif (str(td).find("粉丝") != -1) & (str(td).find("微博") == -1) & (str(td).find("关注") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
fensi = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                for td in list(tds):  # copy again; matched items are removed below
if (str(td).find("关注") != -1) & (str(td).find("粉丝") != -1) & (str(td).find("微博") != -1):
guanzhuTd = str(td).replace(weiboTd, "").replace(fensiTd, "")
tdHtml = BeautifulSoup(guanzhuTd, "html.parser")
guanzhu = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                    elif (str(td).find("关注") != -1) & (str(td).find("粉丝") == -1) & (str(td).find("微博") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
guanzhu = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
        lock.acquire()  # serialize DB writes across the worker threads
        try:
            insert_db(code, fensi, guanzhu, weibo, title)
            print 'success: ' + title
        except:
            traceback.print_exc()
        finally:
            lock.release()  # crawl and store finished; release so the next task can proceed
# Fetch the registered official Weibo links from the seeds table
def find_weibo_seeds():
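    """Return (id, code, url) rows for all seeds with status 0 whose url points at weibo.com."""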
conn = db_carl_connect()
cursor = conn.cursor()
cursor.execute("SET NAMES utf8")
sql = """
        SELECT id, code, url FROM seeds WHERE `status` = 0 AND url LIKE '%weibo.com%' ORDER BY created_at DESC;
"""
cursor.execute(sql)
rs = cursor.fetchall()
cursor.close()
conn.close()
return rs
def insert_db(code, fans_count, follow_count, post_count, weibo_name):
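    """Insert one crawled result into weibo_official_logs and commit (rolling back on failure)."""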
conn = db_carl_connect()
cursor = conn.cursor()
cursor.execute("SET NAMES utf8")
    sql = """
        INSERT INTO `weibo_official_logs` (code, fans_count, follow_count, post_count, weibo_name)
        VALUES (%s, %s, %s, %s, %s)
    """
    try:
        # parameterized execution: quotes in weibo_name can no longer break the statement
        cursor.execute(sql, (code, fans_count, follow_count, post_count, weibo_name))
except:
traceback.print_exc()
try:
conn.commit()
except:
traceback.print_exc()
conn.rollback()
conn.close()
if __name__ == '__main__':
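    # Build the task queue from the pending seeds, then fan the work out to four parser
    # threads that share one proxy and one lock for serialized DB writes.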
taskQueue = Queue.Queue()
jobs = find_weibo_seeds()
parse_thread = []
for job in jobs:
new_job = list(job)
        new_job.append(0)  # retry counter consumed by get_weibo_official
        taskQueue.put(new_job)  # enqueue the task
isfull = taskQueue.full()
print(isfull)
    # names for the four worker threads
    threadName = ['thread-1', 'thread-2', 'thread-3', 'thread-4']
lock = threading.Lock()
proxy = get_proxy()
for name in threadName:
thread_parse = threading.Thread(
target=get_weibo_official,
name=name,
args=(taskQueue, lock, proxy)
)
parse_thread.append(thread_parse)
        thread_parse.start()  # start the worker thread
for thread in parse_thread:
thread.join()
print "over"