中国特色小镇网文章爬取

最新推荐文章于 2025-01-20 09:11:57 发布

糖

最新推荐文章于 2025-01-20 09:11:57 发布

阅读量120

点赞数 1

本文链接：https://blog.youkuaiyun.com/weixin_48252774/article/details/115160062

版权

import os
# Switch the process working directory to a fixed, machine-specific folder.
WORK_DIR = 'C:/Users/wenwen/Desktop'
os.chdir(WORK_DIR)

import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import pandas as pd

# Exploratory single-page run: open one forum thread in Chrome and parse the
# page source with BeautifulSoup.
url = 'http://www.chntsxz.cn/forum.php?mod=viewthread&tid={}'.format(1425)
driver = webdriver.Chrome()
driver.get(url)
time.sleep(5)  # fixed pause before the page source is read

html = driver.page_source
soup = bs(html, features='html.parser')
title = soup.title
# Bare expression: has no effect in a plain script; presumably left over from
# interactive use, where the <title> text would be echoed.
title.string

'复制不了乌镇，拿什么感动游客？ - 专题研究中国特色小镇网 '

# Take the text of the first element carrying class 't_fsz', then delete each
# unwanted token from it, in the listed order.
body = soup.find(class_='t_fsz').text
for noise in ('\n', '\xa0', '下载次数', '下载附件', '上传'):
    body = body.replace(noise, '')
# Bare expression: no effect in a plain script; presumably echoed when run
# interactively.
body

# Click the element with class 'ant-pagination-next'.
# The find_element_by_* helpers were removed in Selenium 4.3; the
# find_element(By, value) form works on both Selenium 3 and Selenium 4.
from selenium.webdriver.common.by import By

driver.find_element(By.CLASS_NAME, 'ant-pagination-next').click()

# Bare expression: lists the elements with class 'y' found in `soup`.
# NOTE: `soup` was parsed before the click above, so this still inspects the
# page as it was first loaded, not the page reached by the click.
soup.find_all(class_='y')

# Thread ids (the `tid` query parameter) of the forum posts to scrape.
page = [
    1425, 1411, 1399, 1391, 1383, 1380, 1369, 1366, 1355, 1346,
    1324, 1301, 1290, 1283, 1281, 1248, 1246, 1225, 1222, 1215,
    1210, 1195, 1174, 1172, 1141, 1129, 1114, 1102, 1093, 1089,
]
# Flat list filled as [title, body, title, body, ...] in scrape order.
paper = []


def _clean_body(text):
    """Return *text* with newlines, '\\xa0' and three fixed labels removed.

    The removed labels are the literal strings '下载次数', '下载附件' and
    '上传'; every occurrence is deleted, in that order.
    """
    for noise in ('\n', '\xa0', '下载次数', '下载附件', '上传'):
        text = text.replace(noise, '')
    return text


# One browser session serves every page instead of launching Chrome once per
# thread; try/finally guarantees the browser is closed even if a page fails.
driver = webdriver.Chrome()
try:
    for i in page:
        url = 'http://www.chntsxz.cn/forum.php?mod=viewthread&tid={}'.format(i)
        driver.get(url)
        time.sleep(5)  # fixed pause before the page source is read
        html = driver.page_source
        soup = bs(html, 'html.parser')

        title_tag = soup.title
        body_tag = soup.find(class_='t_fsz')
        # A thread without a usable <title> or without a 't_fsz' element
        # would otherwise abort the whole run; skip it and keep `paper`
        # strictly in (title, body) pairs.
        if title_tag is None or title_tag.string is None or body_tag is None:
            print('skipped tid={}: title or post body not found'.format(i))
            continue

        title = title_tag.string
        print(title)
        body = _clean_body(body_tag.text)
        print(body)
        paper.append(title)
        paper.append(body)

        # Explicit UTF-8: without `encoding`, open() uses the locale's
        # preferred encoding, which may be unable to represent this text.
        # Appending per page keeps already-scraped articles on disk if a
        # later page fails.
        with open(r'D:\特色小镇.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            f.write(body)
            f.write('\n')
finally:
    driver.quit()

# `paper` is a flat [title, body, title, body, ...] list, so even indexes hold
# titles and odd indexes hold bodies. Slice by stride rather than looping a
# hard-coded 154 times: that raised IndexError whenever fewer than 154
# articles had been collected (the thread-id list has only 30 entries).
title_all = paper[0::2]
for heading in title_all:
    print(heading)

paper_all = paper[1::2]