# Crawler: for learning purposes only!
# Features: scan the site's section catalog, scan each section's pages via
# pagination (extract the article text and regex-filter special characters),
# and save the data to the database. Crawling can run multi-threaded or
# multi-process.
import random
import re
import threading
import multiprocessing  # reserved for the multi-process mode (see the sketch at the end)

import pymysql
import requests
from bs4 import BeautifulSoup  # pages are parsed with the "lxml" parser, so lxml must be installed
user_agent = [
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
"Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
"Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
"Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
"Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
"Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
"Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
"UCWEB7.0.2.37/28/999",
"NOKIA5700/ UCWEB7.0.2.37/28/999",
"Openwave/ UCWEB7.0.2.37/28/999",
"Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
# iPhone 6:
"Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25"
]
headers = {'User-Agent': random.choice(user_agent)}
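# random.choice above runs once at import, so every request reuses a single
# User-Agent. A minimal per-request rotation helper (a sketch; the script
# below still uses the module-level `headers`):
def rand_headers():
    # Pick a fresh random User-Agent on each call.
    return {'User-Agent': random.choice(user_agent)}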
'''
Crawler target: www.cntour.cn
Features: scan the site's catalog, scan each catalog's pages via pagination
(extracting the article data), and save everything to the database.
Crawling can run multi-threaded or multi-process.
'''
def get_str_btw(s, f, b):
    # Return the substring of s between the first occurrence of f and the
    # next occurrence of b ('' if either marker is missing).
    par = s.partition(f)
    return par[2].partition(b)[0]
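# Example: get_str_btw('<a href="/news/1394/">t</a>', 'href="', '"')
# returns '/news/1394/'.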
class Reptile(threading.Thread):
    def catalog(self):
        # Step 1: scan the site's navigation bar and store every section
        # (name, link, tid) in burying_catalog.
        print('*****************catalog***********************')
        self.url = 'http://www.cntour.cn/'
        self.G = requests.get(self.url, headers=headers)
        soup = BeautifulSoup(self.G.content, "lxml")
        for k in soup.find_all('div', class_='navBox mtop'):
            # cntour's navigation block
            self.T_catalog = k.find_all('li')
            for x in range(0, len(self.T_catalog)):
                print(str(self.T_catalog[x]))
                self.ca_name = get_str_btw(str(self.T_catalog[x]), 'blank\">', '</a>')
                self.ca_link = get_str_btw(str(self.T_catalog[x]), 'href=\"', '\"')
                self.ca_link_num = get_str_btw(str(self.T_catalog[x]), 'tid=', '\"')
                self.conect_DB(0, Link=self.ca_link, Link_num=self.ca_link_num, Title=self.ca_name)
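        # Note: BeautifulSoup already exposes parsed attributes, so the string
        # slicing above could also be written as, e.g.:
        #   self.ca_name = self.T_catalog[x].a.get_text()
        #   self.ca_link = self.T_catalog[x].a.get('href')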
    def check_link(self):
        # Step 2: for every unscanned catalog entry, read the total page
        # count from the pager, walk each page, and store every article
        # link in burying_link.
        print('*****************check_link***********************')
        self.Link_total = self.conect_DB(6, Link=None, Link_num=None, Title=None, catalog=None)
        for total_num in range(0, len(self.Link_total)):
            # Each row from conect_DB(6) is (id, catalog, link).
            self.id, self.cata_id, self.Link_tid = self.Link_total[total_num]
            print(self.Link_total[total_num])
            self.url = self.Link_tid
            self.G = requests.get(self.url, headers=headers)
            soup = BeautifulSoup(self.G.content, "lxml")
            for y in soup.find_all('div', class_='pathBox'):
                # The pager renders "1/<total>", so grab the total page count.
                self.page = get_str_btw(str(y.find_all('a')), 'class=\"text\">1/', '</a')
            for Num in range(1, int(self.page) + 1):
                self.url = self.Link_tid + '&page='
                self.G = requests.get(self.url + str(Num), headers=headers)
                soup = BeautifulSoup(self.G.content, "lxml")
                for k in soup.find_all('div', class_='newsList'):
                    a = k.find_all('a')
                    for j in range(0, len(a)):
                        self.Link = get_str_btw(str(a[j]), 'href=\"', '\" target=')
                        self.Link_num = get_str_btw(str(a[j]), '/news/', '/')
                        self.Title = get_str_btw(str(a[j]), 'target=\"_blank\">', '</a>')
                        try:
                            self.conect_DB(2, Link=self.Link, Link_num=self.Link_num, Title=self.Title, catalog=str(self.cata_id))
                        except Exception:
                            # Skip duplicate or malformed rows.
                            pass
                if Num == int(self.page):
                    print('Update********************Update********************')
                    # All pages of this catalog scanned: mark it done.
                    self.conect_DB(7, Link=None, Link_num=str(self.id), Title=None, catalog=None)
    def Dl_conect(self, Res):
        # threadLock.acquire()
        # Step 3: download every pending article, save the cleaned text and
        # any image URLs, then flag the link row as done (1) or failed (2).
        print('*****************Dl_conect***********************')
        self.Res = Res
        for ai in range(0, len(self.Res)):
            self.Link_num = self.Res[ai][1]
            self.id = self.Res[ai][2]
            self.url = self.Res[ai][0]
            # self.url = 'http://www.cntour.cn/news/1394/'
            print(self.url)
            # allow_redirects=False keeps a 30x status code visible, so
            # redirected or removed articles can be flagged as failed.
            self.G = requests.get(self.url, allow_redirects=False, headers=headers)
            if self.G.status_code != 200:
                self.conect_DB(5, Link='2', Link_num=str(self.id), Title=None, catalog=None)
                continue  # do not parse a page we did not get
            soup = BeautifulSoup(self.G.content, "lxml")
            for k in soup.find_all('div', class_='content reset'):
                T_data = []
                # Article text sits in <span> and <p> tags.
                a = k.find_all('span')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.Text = get_str_btw(str(a[j]), '>', '<')
                        T_data.append(self.Text)
                a = k.find_all('p')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.Text = get_str_btw(str(a[j]), '>', '<')
                        T_data.append(self.Text)
                # Flatten the list into one comma-separated string, then strip
                # the escaped-whitespace artifacts (xa0, u200d, ufeff, u3000),
                # comma runs, and stray backslashes left by repr().
                self.T_conect = str(T_data).replace('\', \'', ',').replace('[\'', '').replace('\']', '')\
                    .replace('\"', '').replace('\'', '')
                self.Re = re.sub(r'xa0|u200d|ufeff|,,,|,,|u3000|\\', '', self.T_conect)
                if len(self.Re) < 14000:
                    self.conect_DB(3, Link=None, Link_num=str(self.Link_num), Title=self.Re[0:14000], catalog=None)
                    # print('text saved')
                # Image URLs can appear on <a> as well as <img> tags.
                a = k.find_all('a')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.T_img = get_str_btw(str(a[j]), 'src=\"', '\"')
                        if len(self.T_img) > 0:
                            self.conect_DB(4, Link=None, Link_num=str(self.Link_num), Title=self.T_img, catalog=None)
                a = k.find_all('img')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.T_img = get_str_btw(str(a[j]), 'src=\"', '\"')
                        if len(self.T_img) > 0:
                            self.conect_DB(4, Link=None, Link_num=str(self.Link_num), Title=self.T_img, catalog=None)
                            # print('image URL saved')
            print('download finished')
            self.conect_DB(5, Link='1', Link_num=str(self.id), Title=None, catalog=None)
        # threadLock.release()
    def conect_DB(self, Num, Link, Link_num, Title, catalog=None):
        # Single gateway for all DB work, dispatched on Num:
        #   0 insert a catalog row           1 select links still to download
        #   2 insert an article link         3 insert the article text
        #   4 insert an image URL            5 set a link's re_start status
        #   6 select catalogs still to scan  7 mark a catalog as scanned
        # The SQL is built by plain string concatenation; see the
        # parameterized sketch (insert_context_safe) after the class for a
        # safer variant.
        self.conn = pymysql.connect(user='fiaster',
                                    passwd='******',
                                    db='fast_last',
                                    host='10.0.93.57',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()
        if Num == 0:
            self.sql = "INSERT INTO `fast_last`.`burying_catalog` (`id`, `catalog`, `link`, `name`, `re_start`) " \
                       "VALUES (NULL, \'" + Link_num + "\', \'" + Link + "\', \'" + Title + "\', '0');"
            print(self.sql)
        if Num == 1:
            # Title smuggles in the ORDER BY clause (see run()).
            self.sql = "SELECT link,link_id,id FROM fast_last.burying_link " \
                       "WHERE re_start = '0' AND link_type = '1' AND catalog > '0' AND link_id != 39 " + Title + ";"
        if Num == 2:
            self.sql = "INSERT INTO `fast_last`.`burying_link` (`id`, `title`, `link_type`, `link`, `link_id`, `create_time`, `re_start`, `catalog`) " \
                       "VALUES (NULL, \'" + Title + "\', '1', \'" + Link + "\', \'" + Link_num + "\', NOW(), '0', " + catalog + ");"
        if Num == 3:
            self.sql = "INSERT INTO `fast_last`.`burying_context` (`id`, `link_id`, `context`) " \
                       "VALUES (NULL, \'" + Link_num + "\', \'" + Title + "\');"
        if Num == 4:
            self.sql = "INSERT INTO `fast_last`.`burying_img` (`id`, `link_id`, `img`) " \
                       "VALUES (NULL, \'" + Link_num + "\', \'" + Title + "\');"
        if Num == 5:
            self.sql = "UPDATE `fast_last`.`burying_link` SET `re_start`='" + Link + "' " \
                       "WHERE (`id`='" + Link_num + "');"
        if Num == 6:
            self.sql = "SELECT id,catalog,link FROM fast_last.burying_catalog " \
                       "WHERE re_start = '0';"
        if Num == 7:
            self.sql = "UPDATE `fast_last`.`burying_catalog` " \
                       "SET `re_start`='1' WHERE (`id`=\'" + Link_num + "\');"
        self.S_eat = self.cursor.execute(self.sql)
        self.info = self.cursor.fetchall()
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
        return self.info
    def run(self):
        # Thread entry point. On a first run, uncomment catalog() and
        # check_link() to build the link queue; afterwards each thread just
        # fetches the pending links and downloads them. Two threads can
        # split the work by ordering the queue in opposite directions.
        DB_Desc_A = 'ORDER BY id DESC'
        # DB_Desc_B = 'ORDER BY id ASC'
        # Res_B = self.conect_DB(1, Link=None, Link_num=None, Title=DB_Desc_B, catalog=None)
        # self.Dl_conect(Res_B)
        # self.catalog()
        # self.check_link()
        Res = self.conect_DB(1, Link=None, Link_num=None, Title=DB_Desc_A, catalog=None)
        self.Dl_conect(Res)
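# A parameterized alternative to the string-built INSERT used for Num == 3
# above: a minimal sketch, not wired into Reptile. pymysql escapes the %s
# values itself, which avoids SQL injection and most of the manual quote
# stripping done in Dl_conect. Assumes the same burying_context table.
def insert_context_safe(cursor, link_id, context):
    # cursor: an open pymysql cursor; commit/close remain the caller's job.
    cursor.execute(
        "INSERT INTO `fast_last`.`burying_context` (`id`, `link_id`, `context`) "
        "VALUES (NULL, %s, %s);",
        (link_id, context))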
threadLock = threading.Lock()
threads = []
# Create the worker thread(s); uncomment thread2 for two-threaded crawling.
if __name__ == '__main__':
    thread1 = Reptile()
    # thread2 = Reptile()
    thread1.start()
    # thread2.start()
    thread1.join()
    # thread2.join()
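# Multi-process mode (a sketch, assuming each process builds its own Reptile;
# multiprocessing is imported above but unused otherwise). Every conect_DB
# call opens its own connection, so the processes share no DB handle.
# Commented out so it does not run alongside the thread mode above:
# def crawl_once():
#     Reptile().run()
#
# if __name__ == '__main__':
#     procs = [multiprocessing.Process(target=crawl_once) for _ in range(2)]
#     for p in procs:
#         p.start()
#     for p in procs:
#         p.join()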