Final version: 07_中证网(Plus-Pro).py
# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

for qq in range(8):
    # query = input("【中证网】请输入你想搜索的内容:")
    query = '苏州银行'
    # years to crawl
    year = [2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]
    # number of result pages for each year
    pages = [2, 1, 1, 1, 11, 1, 19, 7]
    year = year[qq]
    pages = pages[qq]
    if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}'):  # create the per-query folder if it does not exist
        os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}')
    m = 0
    for p in range(1, pages + 1):
        url = (f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}'
               f'&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5'
               f'&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn='
               f'&orderby=&timeline=={year}')
        dic = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp = requests.get(url, headers=dic)
        resp.encoding = 'utf-8'
        print(f'\n>>>--------------------第{p}页---------------------<<<\n')
        page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
        alist = page.find_all("table")
        datalist = []
        for ii in alist:
            # each search hit sits in its own <table>; the summary is the <td> with this inline style
            ss = ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
            if ss is not None:
                datalist.append(ss.get_text())
        if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'):  # create the per-year folder if it does not exist
            os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}')
        for ii in range(len(datalist)):
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
            fp.write(datalist[ii] + '\n')  # text only
            print(datalist[ii])
            print(f'\n> > >{year}年,第{p}页,第{ii + 1}篇,成功! < < <')
            fp.close()
        m = m + len(datalist) + 1  # note: the extra +1 leaves a gap in the file numbering after every page
    print('----------------------------')
    print(f'------\n{year}年,爬取完毕----')
    print('----------------------------')
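The final version can be tidied further by factoring the per-page work into small helpers. The sketch below is not one of the scripts above: fetch_page_summaries and save_year are made-up names, the empty query parameters (andsen, orsen, exclude and so on) are dropped on the assumption the search endpoint ignores them, and a request timeout is added; the selector, URL pattern and output layout are copied from the code above.

import os
import requests
from bs4 import BeautifulSoup

BASE = 'D:/桌面/爬虫-银行/中国证券网'   # same output root as the script above
UA = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
STYLE = 'font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;'  # summary <td> style used above

def fetch_page_summaries(query, year, page):
    """Fetch one search-result page and return the summary text of every hit (hypothetical helper)."""
    url = (f'http://search.cs.com.cn/search?page={page}&channelid=215308'
           f'&searchword={query}&keyword={query}&token=12.1462412070719.47'
           f'&perpage=10&outlinepage=5&timeline=={year}')
    resp = requests.get(url, headers=UA, timeout=10)  # timeout is an addition, not in the original
    resp.encoding = 'utf-8'
    soup = BeautifulSoup(resp.text, 'html.parser')
    summaries = []
    for table in soup.find_all('table'):
        td = table.find('td', style=STYLE)
        if td is not None:
            summaries.append(td.get_text())
    return summaries

def save_year(query, year, pages):
    """Save every summary for one year as ({year})N.txt, numbered consecutively."""
    out_dir = f'{BASE}/{query}/{year}'
    os.makedirs(out_dir, exist_ok=True)  # replaces the two isdir/mkdir checks
    n = 0
    for p in range(1, pages + 1):
        for text in fetch_page_summaries(query, year, p):
            n += 1
            with open(f'{out_dir}/({year}){n}.txt', 'w', encoding='utf-8') as fp:
                fp.write(text + '\n')
    print(f'{year}: saved {n} articles')

Called as save_year('苏州银行', 2021, 7), it writes the same ({year})N.txt files as the loop above, but numbers them consecutively.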
Optimization history: 01_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

url = f'http://search.cs.com.cn/search?channelid=215308&perpage=&templet=&token=12.1462412070719.47&searchword={query}'
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic)
resp.encoding = 'utf-8'
page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
alist = page.find("table").find_all("a")
weblist = []
for a in alist:
    if a.get('href')[:5] == "https":
        weblist.append(a.get('href'))

# ---------------- each article on the first result page ----------------
m = 0
for ii in range(len(weblist)):
    url_a = weblist[ii]
    dic_a = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                      "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
    resp_a = requests.get(url_a, headers=dic_a)
    resp_a.encoding = 'gbk'
    page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the html parser
    page_b = page_a.find('section').find_all('p')
    fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{ii + 1}.txt', 'w+', encoding='utf-8')
    txt_list = []
    for txt_a in page_b:
        txt_list.append(txt_a.text)
    # ---------------- write the article text ----------------
    for i in range(len(txt_list)):
        fp.write(txt_list[i] + '\n')  # text only
    fp.close()
    print(f'>>{ii+1}成功!')
    m = ii + 1

# ---------------- remaining pages ----------------
if pages > 1:
    for p in range(pages):
        url_s = f"http://search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
        resp = requests.get(url, headers=dic)  # note: url_s is built but never used, so the first page is fetched again
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, "html.parser")
        alist = page.find("table").find_all("a")
        weblist = []
        for a in alist:
            if a.get('href')[:5] == "https":
                weblist.append(a.get('href'))
        # ---------------- each article on this page ----------------
        for ii in range(len(weblist)):
            url_a = weblist[ii]
            dic_a = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                              "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
            resp_a = requests.get(url_a, headers=dic_a)
            resp_a.encoding = 'gbk'
            page_a = BeautifulSoup(resp_a.text, "html.parser")
            page_b = page_a.find('section').find_all('p')
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{ii + 1 + m}.txt', 'w+', encoding='utf-8')
            txt_list = []
            for txt_a in page_b:
                txt_list.append(txt_a.text)
            for i in range(len(txt_list)):
                fp.write(txt_list[i] + '\n')  # text only
            print(f'>>{ii + 1 + m}成功!')
            m = m + ii + 1
fp.close()  # note: only the last file handle is closed here; the ones opened in the loop above leak
print('---------------\n>>>爬取完毕<<<')
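Versions 01 through 05 all collect article links from the first <table> on the result page and keep only the absolute URLs. A minimal sketch of that step, hardened against <a> tags with no href attribute (collect_links is a hypothetical name, not from the original scripts):

from bs4 import BeautifulSoup

def collect_links(html):
    """Return absolute article URLs found in the first result table (hypothetical helper)."""
    page = BeautifulSoup(html, "html.parser")
    table = page.find("table")
    if table is None:  # no result table on the page
        return []
    links = []
    for a in table.find_all("a"):
        href = a.get("href")
        if href and href.startswith("http"):  # accepts both http and https links
            links.append(href)
    return links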
Optimization history: 02_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

url = f'http://search.cs.com.cn/search?page=1&channelid=215308&searchword={query}'
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                  "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
resp = requests.get(url, headers=dic)
resp.encoding = 'utf-8'
page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
alist = page.find("table").find_all("a")
weblist = []
for a in alist:
    if a.get('href')[:5] == "https":
        weblist.append(a.get('href'))

# ---------------- each article on the first result page ----------------
m = 0
for ii in range(len(weblist)):
    url_a = weblist[ii]
    dic_a = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                      "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
    resp_a = requests.get(url_a, headers=dic_a)
    resp_a.encoding = 'gbk'
    page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the html parser
    page_b = page_a.find('section').find_all('p')
    fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/0/(2021){ii+1}.txt', 'w+', encoding='utf-8')
    txt_list = []
    for txt_a in page_b:
        txt_list.append(txt_a.text)
    # ---------------- write the article text ----------------
    for i in range(len(txt_list)):
        fp.write(txt_list[i] + '\n')  # text only
    fp.close()
    print(f'>>{ii+1}成功!')
    m = ii + 1

# ---------------- remaining pages ----------------
if pages > 1:
    for p in range(pages):
        url_s = f"http://search.cs.com.cn/search?page={p+1}&channelid=215308&searchword={query}"
        resp = requests.get(url, headers=dic)  # note: url_s is built but never used, so page 1 is fetched again
        resp.encoding = 'utf-8'
        page = BeautifulSoup(resp.text, "html.parser")
        alist = page.find("table").find_all("a")
        weblist = []
        for a in alist:
            if a.get('href')[:5] == "https":
                weblist.append(a.get('href'))
        # ---------------- each article on this page ----------------
        for ii in range(len(weblist)):
            url_a = weblist[ii]
            dic_a = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 "
                              "Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/30"}
            resp_a = requests.get(url_a, headers=dic_a)
            resp_a.encoding = 'gbk'
            page_a = BeautifulSoup(resp_a.text, "html.parser")
            page_b = page_a.find('section').find_all('p')
            fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/0/(2021){ii + 1 + m}.txt', 'w+', encoding='utf-8')
            txt_list = []
            for txt_a in page_b:
                txt_list.append(txt_a.text)
            for i in range(len(txt_list)):
                fp.write(txt_list[i] + '\n')  # text only
            print(f'>>{ii + 1 + m}成功!')
            m = m + ii + 1
fp.close()  # note: only the last file handle is closed here
print('---------------\n>>>爬取完毕<<<')
Optimization history: 03_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = (f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}'
           f'&perpage=10&outlinepage=5&&andsen=&total=&orsen=&exclude=&searchscope='
           f'&timescope=&timescopecolumn=&orderby=&timeline==2021')
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
    alist = page.find("table").find_all('a')
    weblist = []
    for a in alist:
        if a.get('href')[:5] == "https":
            weblist.append(a.get('href'))
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the html parser
        page_b = page_a.find('section').find_all('p')
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/2021/(2021){ii+m+1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ---------------- write the article text ----------------
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        print(f'\n> > >{ii+1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1

print('---------------\n>>>爬取完毕<<<')
Optimization history: 04_中证网(网址筛选问题).py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

query = input("【中证网】请输入你想搜索的内容:")
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = (f'http://search.cs.com.cn/search?page={pages}&channelid=215308&searchword={query}'  # note: uses {pages} instead of {p}, so every iteration requests the same result page
           f'&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5'
           f'&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn='
           f'&orderby=&timeline==2020')
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
    alist = page.find("table").find_all('a')
    print('alist:', alist)
    weblist = []
    for a in alist:
        if a.get('href')[4:] == "http":  # bug: the slice direction is wrong, so no link ever matches; see the note after this listing
            weblist.append(a.get('href'))
    print('weblist==', weblist)
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the html parser
        page_b = page_a.find('section').find_all('p')
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/2020/(2020){ii+m+1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii+1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ---------------- write the article text ----------------
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        print(f'\n> > >{ii+1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1

print('---------------\n>>>爬取完毕<<<')
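The URL-filtering problem named in the title comes from the slice direction: href[4:] drops the first four characters and keeps the rest, so comparing the result with "http" never matches and weblist stays empty. A quick illustration with a made-up URL:

href = "https://www.cs.com.cn/some/article.html"  # made-up example URL
print(href[4:] == "http")        # False: "s://www.cs.com.cn/..." is compared with "http"
print(href[:4] == "http")        # True: the first four characters are "http"
print(href.startswith("http"))   # clearer way to write the same check

Version 05 below applies exactly this fix.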
Optimization history: 05_中证网.py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

query = input("【中证网】请输入你想搜索的内容:")
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = (f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}'
           f'&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5'
           f'&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn='
           f'&orderby=&timeline=={year}')
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
    alist = page.find("table").find('tr').find_all('a')
    weblist = []
    for a in alist:
        if a.get('href')[:4] == "http":  # fixed filter: keep links that start with "http"
            weblist.append(a.get('href'))
    print('weblist==', weblist)
    # ---------------- each article on this page ----------------
    for ii in range(len(weblist)):
        url_a = weblist[ii]
        dic_a = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
        resp_a = requests.get(url_a, headers=dic_a)
        resp_a.encoding = 'gbk'
        page_a = BeautifulSoup(resp_a.text, "html.parser")  # specify the html parser
        page_b = page_a.find_all('p')  # every <p> on the article page
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/中国银行/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
        txt_list = []
        for txt_a in page_b:
            txt_list.append(txt_a.text)
        print(f'\n-++++++++++++++++++第{ii + 1}篇文章++++++++++++++++-\n', txt_list, len(txt_list))
        # ---------------- write the article text ----------------
        for i in range(len(txt_list)):
            fp.write(txt_list[i] + '\n')  # text only
        print(f'\n> > >{ii + 1}成功! < < <')
        fp.close()
    m = m + len(weblist) + 1

print('---------------\n>>>爬取完毕<<<')
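Version 05 reads every <p> on the article page, which can also pick up navigation and footer text; versions 06 and 07 sidestep the problem by saving the search-result summaries instead. If full-text extraction is still wanted, one option is to prefer the <section> container that versions 01-04 assumed and fall back to the whole page only when it is missing (a sketch only; extract_paragraphs is not from the original):

from bs4 import BeautifulSoup

def extract_paragraphs(html):
    """Return article paragraphs, preferring the <section> body when present (hypothetical helper)."""
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find("section") or soup  # fall back to the whole page
    return [p.get_text() for p in container.find_all("p")]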
Optimization history: 06_中证网(Plus).py

# coding=utf-8
import requests
from bs4 import BeautifulSoup
import io
import sys
import os

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')  # switch stdout to gb18030 so Chinese output prints correctly

# query = input("【中证网】请输入你想搜索的内容:")
query = '交通银行'
year = int(input('要爬取的年份:'))
pages = int(input("要爬取的页数(不小于1):"))
if pages < 1:
    exit()

m = 0
for p in range(1, pages + 1):
    url = (f'http://search.cs.com.cn/search?page={p}&channelid=215308&searchword={query}'
           f'&keyword={query}&token=12.1462412070719.47&perpage=10&outlinepage=5'
           f'&&andsen=&total=&orsen=&exclude=&searchscope=&timescope=&timescopecolumn='
           f'&orderby=&timeline=={year}')
    dic = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}
    resp = requests.get(url, headers=dic)
    resp.encoding = 'utf-8'
    print(f'\n>>>--------------------第{p}页---------------------<<<\n')
    page = BeautifulSoup(resp.text, "html.parser")  # specify the html parser
    alist = page.find_all("table")
    datalist = []
    for ii in alist:
        # each search hit sits in its own <table>; the summary is the <td> with this inline style
        ss = ii.find('td', style='font-size: 12px;line-height: 24px;color: #333333;margin-top: 4px;')
        if ss is not None:
            datalist.append(ss.get_text())
    if not os.path.isdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}'):  # create the per-year folder if it does not exist
        os.mkdir(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}')
    for ii in range(len(datalist)):
        fp = open(f'D:/桌面/爬虫-银行/中国证券网/{query}/{year}/({year}){ii + m + 1}.txt', 'w+', encoding='utf-8')
        fp.write(datalist[ii] + '\n')  # text only
        print(datalist[ii])
        print(f'\n> > >第{p}页,第{ii + 1}篇,成功! < < <')
        fp.close()
    m = m + len(datalist) + 1

print('----------------------------')
print(f'------\n{year}年,爬取完毕----')
print('----------------------------')