Web crawler: requests + pymysql + BeautifulSoup + re

This crawler is for learning purposes only!!

Features: scan the site's category directory, scan each category's listing pages one page at a time (extracting data and filtering special characters with a regex), and save the data to a database; the crawl can run across multiple threads or processes.

import requests
import pymysql
import threading
import random
import re
from bs4 import BeautifulSoup  # pages are parsed with the "lxml" parser, so lxml must be installed
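# Pool of real-world User-Agent strings; one is picked at random for the request headers.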
user_agent = [
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    "NOKIA5700/ UCWEB7.0.2.37/28/999",
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
    # iPhone 6:
    "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25"
]
headers = {'User-Agent': random.choice(user_agent)}  # one UA chosen at random for the whole run

'''
Crawler target: www.cntour.cn
Features: scan the site's category directory, scan each category's pages one
page at a time (extracting data), and save the data to the database.
The crawl can run across multiple threads or processes.
'''

def get_str_btw(s, f, b):  # return the part of s between the markers f and b
    par = s.partition(f)
    return par[2].partition(b)[0]
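# Example: get_str_btw('<a href="/news/1/">x</a>', 'href="', '"') -> '/news/1/'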
class Reptile(threading.Thread):
    def catalog(self):
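        # Scan the homepage navigation bar and store every category
        # (name, link, tid) via conect_DB(0).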
        print('*****************catalog***********************')
        self.url = 'http://www.cntour.cn/'
        self.G = requests.get(self.url, headers=headers)
        soup = BeautifulSoup(self.G.content,"lxml")
        for k in soup.find_all('div',class_='navBox mtop'):
            # cntour nav list items
            self.T_catalog = k.find_all('li')
            for x in range(0,len(self.T_catalog)):
                print(str(self.T_catalog[x]))
                self.ca_name = get_str_btw(str(self.T_catalog[x]), 'blank\">', '</a>')
                self.ca_link = get_str_btw(str(self.T_catalog[x]),'href=\"','\"')
                self.ca_link_num = get_str_btw(str(self.T_catalog[x]),'tid=','\"')
                self.conect_DB(0, Link=self.ca_link, Link_num=self.ca_link_num, Title=self.ca_name, catalog=None)
    def check_link(self):
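        # For every unfinished catalog row: read the "1/N" page count, walk all N
        # listing pages, store each article link via conect_DB(2), then mark the
        # catalog done via conect_DB(7).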
        print('*****************check_link***********************')
        self.Link_total = self.conect_DB(6, Link=None, Link_num=None, Title=None, catalog=None)
        for total_num in range(0,len(self.Link_total)):
            T_catalog = str(self.Link_total[total_num]).replace('((', '(').replace('))', ')')
            print(T_catalog)
            self.id = get_str_btw(T_catalog, '(', ',')
            self.Link_tid = get_str_btw(T_catalog, '\'', '\'')
            self.cata_id = get_str_btw(T_catalog, ', ', ',')
            self.url = self.Link_tid
            self.G = requests.get(self.url,headers=headers)
            soup = BeautifulSoup(self.G.content, "lxml")
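            # the pathBox block renders pagination as "1/N"; grab N as the page count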
            for y in soup.find_all('div',class_='pathBox'):
                self.page = get_str_btw(str(y.find_all('a')),'class=\"text\">1/','</a')
            for Num in range(1,int(self.page)+1):
                self.url = self.Link_tid + '&page='
                self.G = requests.get(self.url + str(Num),headers=headers)
                soup = BeautifulSoup(self.G.content, "lxml")
                for k in soup.find_all('div',class_='newsList'):
                    a = k.find_all('a')
                    for j in range(0,len(a)):
                        self.Link = get_str_btw(str(a[j]),'href=\"','\" target=')
                        self.Link_num = get_str_btw(str(a[j]), '/news/', '/')
                        self.Title = get_str_btw(str(a[j]),'target=\"_blank\">','</a>')
                        try:
                            self.conect_DB(2, Link=self.Link, Link_num=self.Link_num, Title=self.Title, catalog=self.cata_id)
                        except Exception:
                            pass  # skip rows that fail to insert
                if Num == int(self.page):
                    print('Update********************Update********************')
                    self.conect_DB(7, Link=None, Link_num=self.id, Title=None, catalog=None)
    def Dl_conect(self,Res):
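        # Download each article page, pull text out of <span>/<p> tags and image
        # URLs out of <a>/<img> src attributes, then store them via conect_DB(3)/(4)
        # and flag the link as done (or dead) via conect_DB(5).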
        # threadLock.acquire()
        print('*****************Dl_conect***********************')
        self.Res = Res
        for ai in range(0,len(self.Res)):
            self.Link_num = self.Res[ai][1]
            self.id = self.Res[ai][2]
            self.url = self.Res[ai][0]
            # self.url = 'http://www.cntour.cn/news/1394/'
            print(self.url)
            self.G = requests.get(self.url, allow_redirects=False, headers=headers)  # allow_redirects=False so a redirect shows up as a non-200 status
            if self.G.status_code != 200:
                self.conect_DB(5, Link='2', Link_num=str(self.id), Title=None, catalog=None)
            soup = BeautifulSoup(self.G.content, "lxml")
            for k in soup.find_all('div', class_='content reset'):
                T_data = []
                a = k.find_all('span')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.Text = get_str_btw(str(a[j]), '>', '<')
                        T_data.append(self.Text)
                a = k.find_all('p')
                if len(a) != 0:
                    for j in range(0, len(a)):
                        self.Text = get_str_btw(str(a[j]), '>', '<')
                        T_data.append(self.Text)
                self.T_conect = str(T_data).replace('\', \'', ',').replace('[\'', '').replace('\']', '')\
                    .replace('\"','').replace('\'','')
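                # strip escape-sequence remnants (xa0, u200d, ufeff, u3000), runs of
                # commas, and stray backslashes left behind by the str() conversion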
                self.Re = re.sub(r'xa0|u200d|ufeff|,,,|,,|u3000|\\', '', self.T_conect)
                if len(self.Re) < 14000:
                    self.conect_DB(3, Link=None, Link_num=str(self.Link_num), Title=self.Re[0:14000], catalog=None)
                    # print('text saved')

                a = k.find_all('a')
                if len(a) != 0:
                    for j in range(0,len(a)):
                        self.T_img = get_str_btw(str(a[j]),'src=\"','\"')
                        if len(self.T_img) > 0:
                            self.conect_DB(4, Link=None, Link_num=str(self.Link_num), Title=self.T_img, catalog=None)
                            # print('image URL saved')
                a = k.find_all('img')
                if len(a) != 0:
                    for j in range(0,len(a)):
                        self.T_img = get_str_btw(str(a[j]),'src=\"','\"')
                        if len(self.T_img) > 0:
                            self.conect_DB(4, Link=None, Link_num=str(self.Link_num), Title=self.T_img, catalog=None)
                            # print('image URL saved')
                print('Download complete')
                self.conect_DB(5, Link='1', Link_num=str(self.id), Title=None, catalog=None)
            # threadLock.release()
    def conect_DB(self,Num,Link,Link_num,Title,catalog):
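        # Num selects the operation:
        #   0: insert a catalog row           1: select unfinished article links
        #   2: insert an article link         3: insert article text
        #   4: insert an image URL            5: update a link's re_start flag
        #   6: select unfinished catalogs     7: mark a catalog as finished
        # NOTE: the concatenated SQL below is injection-prone; pymysql also
        # supports parameterized queries, e.g. cursor.execute(sql, params).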
        self.coon = pymysql.connect(user='fiaster',
                                    passwd='******',
                                    db='fast_last',
                                    host='10.0.93.57',
                                    charset='utf8mb4')
        self.cursor = self.coon.cursor()
        if Num == 0:
            self.sql_sel = "INSERT INTO `fast_last`.`burying_catalog` (`id`, `catalog`, `link`, `name`, `re_start`) " \
                           "VALUES (NULL, \'"+ Link_num +"\', \'"+ Link +"\', \'"+ Title +"\','0');"

            print(self.sql_sel)
            self.S_eat = self.cursor.execute(self.sql_sel)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
        if Num == 1:
            self.sql_sel = "SELECT link,link_id,id from fast_last.burying_link " \
                           "where re_start = '0' and link_type = '1' and catalog > '0'  and link_id !=39 "+ Title +";"
            # print(self.sql_sel)
            self.S_eat = self.cursor.execute(self.sql_sel)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
            return self.info
        if Num == 2 :
            self.sql_int = "INSERT INTO `fast_last`.`burying_link` (`id`, `title`, `link_type`, `link`, `link_id`, `create_time`, `re_start`, `catalog`) " \
                           "VALUES (Null, \'" + Title + "\' , '1' , \'" + Link + "\', \'" + Link_num + "\', NOW(), '0',"+catalog+");"
            self.S_eat = self.cursor.execute(self.sql_int)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
        if Num == 3:
            self.sql_int = "INSERT INTO `fast_last`.`burying_context` (`id`, `link_id`, `context`) " \
                           "VALUES (Null, \'"+ Link_num +"\', \'"+ Title +"\');"
            self.S_eat = self.cursor.execute(self.sql_int)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
            # return self.info
        if Num == 4:
            self.sql_int = "INSERT INTO `fast_last`.`burying_img` (`id`, `link_id`, `img`) " \
                           "VALUES (Null, \'"+ Link_num +"\', \'"+ Title +"\');"
            self.S_eat = self.cursor.execute(self.sql_int)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
        if Num == 5:
            self.sql_int = "UPDATE `fast_last`.`burying_link` SET `re_start`='"+ Link +"' " \
                           "WHERE (`id`='"+ Link_num +"');"
            self.S_eat = self.cursor.execute(self.sql_int)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
        if Num == 6:
            self.sql_sel = "SELECT id,catalog,link from fast_last.burying_catalog " \
                           "WHERE re_start ='0';"
            self.S_eat = self.cursor.execute(self.sql_sel)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
            return self.info
        if Num == 7:
            self.sql_sel = "UPDATE `fast_last`.`burying_catalog` " \
                           "SET `re_start`='1' WHERE (`id`=\'"+Link_num+"\');"
            self.S_eat = self.cursor.execute(self.sql_sel)
            self.info = self.cursor.fetchall()
            self.coon.commit()
            self.cursor.close()
            self.coon.close()
            self.Mysql_data = []
            return self.info
    def run(self):
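        # Entry point for each thread: fetch unfinished article links and download them.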
        DB_Desc_A = 'ORDER BY id DESC'
        # DB_Desc_B = 'ORDER BY id ASC'
        # Res_A = self.conect_DB(1, Link=None, Link_num=None, Title=DB_Desc_A, catalog=None)
        # Res_B = self.conect_DB(1, Link=None, Link_num=None, Title=DB_Desc_B, catalog=None)
        # self.Dl_conect(Res_A)
        # self.Dl_conect(Res_B)
        # self.catalog()
        Res = self.conect_DB(1, Link=None, Link_num=None, Title=DB_Desc_A, catalog=None)
        self.Dl_conect(Res)


threadLock = threading.Lock()  # only used by the commented-out locking in Dl_conect


# Create and start the crawler thread
if __name__ == '__main__':
    thread1 = Reptile()
    # thread2 = Reptile()
    thread1.start()
    # thread2.start()
    thread1.join()
    # thread2.join()
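
The __main__ block above only exercises the threading path; the intro also mentions multi-process crawling. Here is a minimal sketch of that variant, assuming the Reptile class above (the two-process count and the crawl() helper are illustrative, and nothing here partitions the work, so processes may fetch overlapping links):

import multiprocessing

def crawl():
    # run() executes in the calling process; each process opens its own
    # pymysql connections, which must not be shared across processes
    Reptile().run()

if __name__ == '__main__':
    procs = [multiprocessing.Process(target=crawl) for _ in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()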

