python爬虫爬取python高级教程到单独的txt文件

最新推荐文章于 2025-07-23 18:06:16 发布

原创最新推荐文章于 2025-07-23 18:06:16 发布 · 800 阅读

0 ·

CC 4.0 BY-SA版权

Python 专栏收录该内容

2 篇文章

订阅专栏

本文介绍了一种爬取Python中文社区教程的方法，包括获取页面源码、解析HTML标签、提取文章标题及内容，并将其保存为本地文件的过程。

根据视频教程学习的，由于时间久了，不记得看的是哪个教程了……

入口：http://www.pythontab.com/html/pythonjichu/ （注意：下面的代码实际爬取的是 http://www.pythontab.com/html/pythonhexinbiancheng/ 栏目）
下载python中文社区教程：
1.获取页面源码
2.找到标题跟内容所对应的标签
3.把标题和所对应的内容放一起
4.如何打印文章内容
5.打印标题
6.写入文件     文件名为标题      open函数

#_*_ coding:utf-8 _*_
import urllib2
from bs4 import BeautifulSoup
from spyder.utils.help.conf import html_context

# Listing pages to crawl: the category index itself comes first, followed by
# the numbered pages 2.html through 20.html that sit next to it.
url = 'http://www.pythontab.com/html/pythonhexinbiancheng/'
url_list = [url]
url_list.extend(
    'http://www.pythontab.com/html/pythonhexinbiancheng/{}.html'.format(page)
    for page in range(2, 21)
)


# Every {"title", "link"} entry found on the listing pages, in crawl order.
source_list = []


def _fetch(address):
    """Return the raw body served at *address*, closing the response afterwards."""
    response = urllib2.urlopen(address)
    try:
        return response.read()
    finally:
        response.close()


def _safe_filename(title):
    """Return *title* with characters that are illegal in Windows file names replaced."""
    # The first five replacements are the script's long-standing mapping;
    # the remaining ones cover the other characters Windows rejects.
    return (title.replace('*', '').replace('/', 'or').replace('"', ' ')
            .replace('?', 'wenhao').replace(':', ' ')
            .replace('\\', ' ').replace('<', ' ').replace('>', ' ')
            .replace('|', ' '))


for j in url_list:
    # Parse one listing page.
    soup = BeautifulSoup(_fetch(j), 'html.parser')

    # Each anchor under #catlist carries both the article title (its text)
    # and the article URL (its href), so a single pass over them is enough.
    page_entries = []
    for anchor in soup.select('#catlist > li > a'):
        href = anchor.get('href')
        if not href:
            # Nothing to download for an anchor without a target.
            continue
        page_entries.append({
            "title": anchor.get_text(),
            "link": href,
        })
    source_list.extend(page_entries)

    # Download only the articles discovered on THIS page. Iterating the
    # cumulative source_list here would re-fetch and rewrite every article
    # from earlier pages once per later page.
    for entry in page_entries:
        soup_cont = BeautifulSoup(_fetch(entry['link']), 'html.parser',
                                  from_encoding="utf-8")
        # Article body: the text of each <p> directly under div.content,
        # encoded to UTF-8 bytes because the file is opened in binary mode.
        text = [p.get_text().encode('utf-8')
                for p in soup_cont.select('div.content > p')]

        # The article title doubles as the output file name.
        title_text = _safe_filename(entry['title'])

        with open("F:/reptile_Python/python_spider/download/%s.txt" % title_text, 'wb') as f:
            f.writelines(text)