#coding=utf-8
import MySQLdb
import os.path
try:
from sshtunnel import SSHTunnelForwarder
except ImportError:  # sshtunnel is optional; fall back gracefully when it is not installed
SSHTunnelForwarder = None
import traceback
import json
import requests
import Queue
import threading
from bs4 import BeautifulSoup
import sys
import time
import re
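# Python 2 default-encoding hack: the code below mixes str and unicode (page titles, SQL strings),
# so the default encoding is forced to utf8 to avoid UnicodeDecodeError on implicit conversions.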
reload(sys)
sys.setdefaultencoding('utf8')
class DbConnection(object):
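    """Lazily-opened MySQL connection; subclasses supply host/port/user/passwd/db as class attributes."""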
def __init__(self):
self.conn = None
def open(self):
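        """Open the MySQLdb connection on first use and reuse it on later calls."""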
child = type(self)
if self.conn is None:
self.conn = MySQLdb.connect(
host=child.host,
port=child.port,
user=child.user,
passwd=child.passwd,
db=child.db,
charset="utf8"
)
return self.conn
class DbSpider(DbConnection):
host = '127.0.0.1'
port = 3306
user = 'root'
passwd = '123456'
db = 'bi'
class DbBusiness(DbConnection):
host = '127.0.0.1'
port = 3306
user = 'root'
passwd = '123456'
db = 'carl'
def db_carl_connect():
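    """Return a connection built from DbSpider, i.e. the spider ('bi') database."""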
return DbSpider().open()
def db_business_connect():
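    """Return a connection to the business ('carl') database."""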
return DbBusiness().open()
def get_proxy():
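    """Fetch an HTTP proxy from the Kuaidaili API and return it once a one-second test request
    through it succeeds; return None after 20 failed attempts."""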
for i in range(20):
        print "get proxy attempt:", i + 1
        # Kuaidaili (kdlapi.com) proxy API
proxy_url = requests.get(
"http://svip.kdlapi.com/api/getproxy/?orderid=xxxxxxxxxx&num=1&protocol=1&method=2&an_ha=1&quality=2&sep=1"
).text
proxy = {"http": "http://%s" % proxy_url}
        headers = {
            'Accept-Encoding': 'gzip',
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"}
        try:
            # quick connectivity check through the candidate proxy before returning it
            requests.get("https://www.baidu.com/", headers=headers, proxies=proxy, timeout=1)
            time.sleep(0.5)
            return proxy
        except requests.RequestException:
            continue
return None
def get_weibo_official(taskQueue, lock, proxy):
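    """Worker loop: pull jobs from taskQueue, fetch each Weibo profile page through the shared
    proxy, parse follower/following/post counts out of the FM.view() script blocks, and store
    the result with insert_db under the shared lock. Visitor-wall pages are requeued up to twice."""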
    while True:
        try:
            job = taskQueue.get(block=False)  # non-blocking: empty() followed by get() can strand a worker
        except Queue.Empty:
            break
        print 'Analyzing:', threading.currentThread().name
        print taskQueue.qsize()
code = job[1]
url = job[2]
if url.find("https") == -1:
url = url.replace("http", "https")
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0',
'Cookie': 'SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9W5GFdE6.C7laCRkjeTxrHe45JpVF02Reo27ehqXeK.0; SUB=_2AkMpfdlUdcPxrAVRmfwQyG_gbY5H-jyaqLCiAn7uJhMyAxh77m1TqSVutBF-XIn0-PK-VwTV_U7zrWgqi1Z9I8-S;'
        }  # the Cookie header is a required parameter for this request
        suid_resp = requests.get(url, proxies=proxy, headers=headers, timeout=15)  # timeout so a dead proxy cannot hang the worker
html = suid_resp.text
soup = BeautifulSoup(html, "html.parser")
script = soup.find_all("script")
        title = str(soup.find_all("title")[0].get_text()).replace("的微博_微博", "")  # strip the "的微博_微博" title suffix, leaving the account name
if title == "Sina Visitor System":
            if job[3] < 2:  # retry limit: requeue at most twice, then give up
job[3] = job[3] + 1
taskQueue.put(job)
proxy = get_proxy()
print 'weiboOfficialHtml is null, code= %s' % (code)
continue
else:
continue
elif title == "414 Request-URI Too Large":
continue
regex = re.compile(u'[\u4E00-\u9FA5]+')
regex_str = re.compile('([a-zA-Z0-9])')
res = regex.findall(title.decode('utf-8'))
res_str = regex_str.findall(title)
        if not res and not res_str:  # findall() returns a list, never None; skip titles with no usable characters
continue
        fensi = 0  # follower count (粉丝)
        guanzhu = 0  # following count (关注)
        weibo = 0  # post count (微博)
for s in script:
if str(s).find("粉丝") != -1 and str(s).find("关注") != -1 and str(s).find("微博") != -1 and str(s).find("Pl_Core_T8CustomTriColumn__3") != -1:
                filtered = str(s).replace("\\t", "").replace("\\n", "").replace("\\r", "")
                dataHtml = json.loads(filtered[filtered.index("<script>FM.view(") + 16:filtered.index(")</script>")])["html"]
finalHtml = BeautifulSoup(dataHtml,"html.parser")
tds = finalHtml.find_all("td")
                weiboTd = ""
                fensiTd = ""
                guanzhuTd = ""
                for td in list(tds):  # iterate over a copy: tds is mutated inside the loop
if (str(td).find("微博") != -1) & (str(td).find("粉丝") == -1) & (str(td).find("关注") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
weiboTd = str(td)
weibo = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                for td in list(tds):  # copy again; matched items are removed below
if (str(td).find("粉丝") != -1) & (str(td).find("微博") != -1) & (str(td).find("关注") == -1):
fensiTd = str(td).replace(weiboTd, "")
tdHtml = BeautifulSoup(fensiTd, "html.parser")
fensi = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                    elif (str(td).find("粉丝") != -1) & (str(td).find("微博") == -1) & (str(td).find("关注") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
fensi = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                for td in list(tds):  # copy again; matched items are removed below
if (str(td).find("关注") != -1) & (str(td).find("粉丝") != -1) & (str(td).find("微博") != -1):
guanzhuTd = str(td).replace(weiboTd, "").replace(fensiTd, "")
tdHtml = BeautifulSoup(guanzhuTd, "html.parser")
guanzhu = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
                    elif (str(td).find("关注") != -1) & (str(td).find("粉丝") == -1) & (str(td).find("微博") == -1):
tdHtml = BeautifulSoup(str(td), "html.parser")
guanzhu = int(
str(BeautifulSoup(str(tdHtml.find_all("strong")), "html.parser").get_text()).replace("[","").replace("]", ""))
tds.remove(td)
        lock.acquire()  # serialize DB writes across the worker threads
        try:
            insert_db(code, fensi, guanzhu, weibo, title)
            print 'success: ' + title
        except:
            traceback.print_exc()
        finally:
            lock.release()  # crawl and store finished; release so the next task can proceed
# Fetch the registered official Weibo links from the seeds table
def find_weibo_seeds():
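    """Return (id, code, url) rows for all seeds with status 0 whose url points at weibo.com."""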
conn = db_carl_connect()
cursor = conn.cursor()
cursor.execute("SET NAMES utf8")
sql = """
        SELECT id, code, url FROM seeds WHERE `status` = 0 AND url LIKE '%weibo.com%' ORDER BY created_at DESC;
"""
cursor.execute(sql)
rs = cursor.fetchall()
cursor.close()
conn.close()
return rs
def insert_db(code, fans_count, follow_count, post_count, weibo_name):
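    """Insert one crawled result into weibo_official_logs and commit (rolling back on failure)."""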
conn = db_carl_connect()
cursor = conn.cursor()
cursor.execute("SET NAMES utf8")
    sql = """
        INSERT INTO `weibo_official_logs` (code, fans_count, follow_count, post_count, weibo_name)
        VALUES (%s, %s, %s, %s, %s)
    """
    try:
        # parameterized execution: quotes in weibo_name can no longer break the statement
        cursor.execute(sql, (code, fans_count, follow_count, post_count, weibo_name))
except:
traceback.print_exc()
try:
conn.commit()
except:
traceback.print_exc()
conn.rollback()
conn.close()
if __name__ == '__main__':
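    # Build the task queue from the pending seeds, then fan the work out to four parser
    # threads that share one proxy and one lock for serialized DB writes.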
taskQueue = Queue.Queue()
jobs = find_weibo_seeds()
parse_thread = []
for job in jobs:
new_job = list(job)
        new_job.append(0)  # retry counter consumed by get_weibo_official
        taskQueue.put(new_job)  # enqueue the task
isfull = taskQueue.full()
print(isfull)
    # names for the four worker threads
    threadName = ['thread-1', 'thread-2', 'thread-3', 'thread-4']
lock = threading.Lock()
proxy = get_proxy()
for name in threadName:
thread_parse = threading.Thread(
target=get_weibo_official,
name=name,
args=(taskQueue, lock, proxy)
)
parse_thread.append(thread_parse)
        thread_parse.start()  # start the worker thread
for thread in parse_thread:
thread.join()
print "over"