中国特色小镇网文章爬取

import os
os.chdir('C:/Users/wenwen/Desktop')
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
import pandas as pd
# Exploratory single-page run: open one forum thread in a Selenium-driven
# Chrome window, parse the HTML with BeautifulSoup and look at its <title>.
url = 'http://www.chntsxz.cn/forum.php?mod=viewthread&tid=1425'
driver = webdriver.Chrome()
driver.get(url)
# Fixed 5-second pause before reading the page source
# (presumably to let the page finish loading -- TODO confirm it is needed).
time.sleep(5)
html = driver.page_source
soup = bs(html,'html.parser')
title = soup.title
# Bare expression: the value is only shown in an interactive session
# (notebook/REPL); when run as a script this line has no visible effect.
title.string

# Output: '复制不了乌镇,拿什么感动游客? - 专题研究 中国特色小镇网 '

# Pull the text of the first element carrying class 't_fsz' out of the
# parsed page, then delete whitespace and boilerplate fragments from it.
body = soup.find(class_='t_fsz').text
# Fragments are removed one after another, in exactly this order.
for fragment in ('\n', '\xa0', '下载次数', '下载附件', '上传'):
    body = body.replace(fragment, '')
# Bare expression: displays the cleaned text in an interactive session only.
body
# Selenium 4.3 removed the find_element_by_* helpers; the generic
# find_element() with a By locator works on both Selenium 3 and 4.
# Imported here because this is the only statement that needs it.
from selenium.webdriver.common.by import By

# Click the element with class 'ant-pagination-next' in the live browser.
driver.find_element(By.CLASS_NAME, 'ant-pagination-next').click()
# NOTE: `soup` was parsed from the page source captured before the click,
# so this searches that earlier snapshot, not the page shown after clicking.
# Bare expression: the result is only displayed in an interactive session.
soup.find_all(class_='y')
# Thread ids (the `tid` URL parameter) of the forum posts to download.
page = [
    1425, 1411, 1399, 1391, 1383, 1380, 1369, 1366, 1355, 1346,
    1324, 1301, 1290, 1283, 1281, 1248, 1246, 1225, 1222, 1215,
    1210, 1195, 1174, 1172, 1141, 1129, 1114, 1102, 1093, 1089,
]
# Flat result list, filled pairwise: [title, body, title, body, ...].
paper = []

# One browser session is reused for every page instead of launching a new
# Chrome per thread; the try/finally guarantees the browser is closed even
# when a page fails part-way through.
driver = webdriver.Chrome()
try:
    for i in page:
        url = 'http://www.chntsxz.cn/forum.php?mod=viewthread&tid={}'.format(i)
        driver.get(url)
        time.sleep(5)  # fixed pause before reading the page source
        html = driver.page_source
        soup = bs(html, 'html.parser')

        title_tag = soup.title
        body_tag = soup.find(class_='t_fsz')
        title = title_tag.string if title_tag is not None else None
        # A thread without a usable <title> or 't_fsz' element would
        # otherwise abort the whole run; skip it and keep `paper` paired.
        if title is None or body_tag is None:
            print('skipped tid={}: title or body not found'.format(i))
            continue
        print(title)

        body = body_tag.text
        # Strip whitespace and boilerplate fragments, in this exact order.
        for fragment in ('\n', '\xa0', '下载次数', '下载附件', '上传'):
            body = body.replace(fragment, '')
        print(body)

        paper.append(title)
        paper.append(body)

        # Append after every page so finished work survives a later failure.
        # Explicit UTF-8: the platform default codec may be unable to encode
        # some characters of the scraped text. If the file already holds data
        # written in another encoding, start a fresh file.
        with open(r'D:\特色小镇.txt', 'a', encoding='utf-8') as f:
            f.write(title)
            f.write(body)
            f.write('\n')
finally:
    driver.quit()
# `paper` alternates title, body, title, body, ...; split it into two
# parallel lists. Slicing by parity adapts to however many pages were
# scraped, instead of assuming a fixed count of 154 (which raises
# IndexError when fewer pairs are present).
title_all = paper[0::2]
for scraped_title in title_all:
    print(scraped_title)
paper_all = paper[1::2]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值