python钓鱼评论爬取

最新推荐文章于 2024-10-21 13:42:07 发布

一青CSU

最新推荐文章于 2024-10-21 13:42:07 发布

阅读量334

点赞数

分类专栏： # python爬虫 文章标签： python 爬虫

本文链接：https://blog.youkuaiyun.com/weixin_42348202/article/details/106232463

版权

python爬虫专栏收录该内容

2 篇文章

订阅专栏

python钓鱼评论爬取并存入txt

import re#python正则分割
# from  bs4 import BeautifulSoup#网页美味汤
#from selenium import webdriver#模拟点击鼠标点击网页库
#import time#时间
import  requests#直接爬取网页库
def direct(secname, *, timeout=10, session=None):
    """Return the district listing-page URLs linked from a diaoyu.com page.

    Downloads ``secname`` and collects every absolute URL of the form
    ``http://www.diaoyu.com/diaochang/changsha/list-<id>-4-1-1.html`` that
    appears in the response body.

    Args:
        secname: URL of the page to download.
        timeout: Seconds passed to the HTTP ``get`` call as its ``timeout``,
            so a stalled server cannot block the crawl forever.
        session: Optional object exposing a ``requests``-compatible
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). When omitted, ``requests.get`` is used.

    Returns:
        The matching URLs as a list of strings, in the order they occur in
        the page. Duplicates are kept because the caller in this file
        selects entries by position.
    """
    fetch = requests.get if session is None else session.get
    html = fetch(secname, timeout=timeout).text
    # Raw string with escaped dots; the id part is non-greedy and may not
    # contain quotes, angle brackets or whitespace, so two links sitting on
    # the same line are no longer merged into a single bogus match.
    rule = r"http://www\.diaoyu\.com/diaochang/changsha/list-[^\s\"'<>]*?-4-1-1\.html"
    return re.findall(rule, html)
def direct2(secname, *, timeout=10, session=None):
    """Return the unique fishing-spot detail URLs linked from a listing page.

    Downloads ``secname`` and collects every absolute URL of the form
    ``http://www.diaoyu.com/diaochang/changsha/<digits>.html``.

    Args:
        secname: URL of the listing page to download.
        timeout: Seconds passed to the HTTP ``get`` call as its ``timeout``,
            so a stalled server cannot block the crawl forever.
        session: Optional object exposing a ``requests``-compatible
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). When omitted, ``requests.get`` is used.

    Returns:
        A list of distinct URL strings, in order of first appearance.
    """
    fetch = requests.get if session is None else session.get
    html = fetch(secname, timeout=timeout).text
    # Raw string so ``\d`` is a regex escape rather than an invalid string
    # escape; the dots are escaped so only a literal ".html" suffix matches.
    rule = r"http://www\.diaoyu\.com/diaochang/changsha/\d+\.html"
    # dict.fromkeys drops duplicates while keeping a stable order, unlike
    # set(), whose iteration order is arbitrary.
    return list(dict.fromkeys(re.findall(rule, html)))
def direct3(secname, *, timeout=10, session=None):
    """Return the raw review snippets found on a fishing-spot detail page.

    Downloads ``secname`` and collects every fragment shaped like::

        </span>N分
        </span>
        <p>comment text</p>

    where ``N`` is a single digit. Each result is the whole matched
    fragment, markup included, so the fixed prefix is always 21 characters
    and the suffix is the 4-character ``</p>``.

    Args:
        secname: URL of the detail page to download.
        timeout: Seconds passed to the HTTP ``get`` call as its ``timeout``,
            so a stalled server cannot block the crawl forever.
        session: Optional object exposing a ``requests``-compatible
            ``get(url, timeout=...)`` method (for example a
            ``requests.Session``). When omitted, ``requests.get`` is used.

    Returns:
        A list of distinct matched fragments, in order of first appearance.
    """
    fetch = requests.get if session is None else session.get
    html = fetch(secname, timeout=timeout).text
    # Raw string so ``\d`` is a regex escape; the comment body is matched
    # non-greedily so it stops at the first ``</p>`` instead of running on to
    # the last one on the same line.
    rule = r"</span>\d分\n</span>\n<p>.*?</p>"
    # Order-preserving de-duplication (set() iteration order is arbitrary).
    # NOTE(review): identical score+text posted twice collapses to a single
    # entry, as it did before - confirm that this is intended.
    return list(dict.fromkeys(re.findall(rule, html)))

def main(outfiles='E:\\comments.txt'):
    """Crawl Changsha fishing-spot reviews from diaoyu.com into a text file.

    Walks listing pages -> spot detail pages -> review snippets using
    ``direct``, ``direct2`` and ``direct3``, then writes one line per review
    as ``<score>\\t<comment>\\t\\n`` in UTF-8.

    Args:
        outfiles: Path of the output text file. Defaults to the location the
            script has always written to.
    """
    gg = direct("http://www.diaoyu.com/diaochang/changsha/list-0-4-1-1.html")
    print(gg)

    allcomments = []
    # The first two listing links are skipped; the original note here said
    # the first district (Furong) has no content.
    # NOTE(review): the skip is positional - confirm it still matches the
    # link order of the live page.
    for district_url in gg[2:]:
        print(district_url)
        for spot_url in direct2(district_url):
            allcomments.extend(direct3(spot_url))

    # The context manager guarantees the file is flushed and closed, even if
    # a write fails part-way through.
    with open(outfiles, 'w', encoding='utf-8') as output:
        for snippet in allcomments:
            # Each snippet starts with "</span>" (7 chars), so [7:9] is the
            # score digit plus its unit character; the comment text starts
            # after the 21-char prefix and ends before the closing "</p>".
            score = snippet[7:9]
            comment = snippet[21:-4]
            print(score)
            output.write(score + "\t" + comment + "\t\n")
# Call main() when this file is run as a script
if __name__ == '__main__':
    main()