python爬虫_获取知乎最多100篇文章

最新推荐文章于 2025-10-23 01:42:57 发布

转载最新推荐文章于 2025-10-23 01:42:57 发布 · 2.5k 阅读

文章标签：

#爬虫

python 专栏收录该内容

20 篇文章

订阅专栏

本文介绍了一个使用Python编写的简单知乎爬虫程序，该程序能够抓取知乎上的精选文章并保存到本地文件中。虽然代码存在一些问题，比如抓取的文章过多时会超出Python最大递归深度的限制，但对于初学者来说仍是一个不错的实践案例。

这几天经常上知乎，觉得里面有些文章或者回答确实不错，就花了一个晚上的时间写了这个爬虫。以前没有用Python写过独立的程序，所以这个程序bug比较多。现在贴出的代码可以运行，会在同级目录下生成zhihu_jingxuan.txt，该txt中就是爬取的文章。主要的问题是，当爬取的文章过多时，就会报超出最大递归深度的错误（maximum recursion depth exceeded）。简单地查了一下，Python默认允许的最大递归深度是1000（可以用sys.getrecursionlimit()查看，用sys.setrecursionlimit()调整）。用到了BeautifulSoup库，觉得它里面获取标签的时候应该是用了递归，标签嵌套过深时就会超出这个限制。在此记录一下，有空看看源码。

[python] view plain copy

print ?

#coding:utf-8
import urllib
from bs4 import BeautifulSoup
import re
# Site root; relative hrefs scraped from the pages are appended to it.
url = "http://www.zhihu.com"
# Output file; every extracted article is appended to it.
filename = "zhihu_jingxuan.txt"
def parseArticleFromHtml(html):
    """Extract the page title and every answer body from a Zhihu page.

    Args:
        html: Raw HTML of the page, in any form ``BeautifulSoup`` accepts.

    Returns:
        Text that starts with ``<<title>>``, followed by one section per
        ``div.zm-editable-content`` element (each introduced by a line of
        asterisks, with every tag replaced by a line break), and ending
        with a ``<><>...`` terminator. Runs of consecutive line breaks are
        collapsed into a single one.
    """
    # Name the parser explicitly: letting bs4 pick one makes the parse
    # tree depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # A page without <head>/<title>, or with an empty title, must not crash.
    head = soup.head
    title_tag = head.title if head is not None else None
    title = title_tag.string if title_tag is not None else None
    parts = ["<<", title or "", ">>\r\n"]

    for block in soup.findAll('div', {'class': 'zm-editable-content'}):
        # decode() yields text on Python 2 and 3 alike, so the markup can
        # be joined with the (unicode) title without implicit conversions.
        markup = block.decode()
        parts.append("*************************\r\n")
        parts.append(re.sub('<[^>]+>', "\r\n", markup))
        parts.append("\r\n")

    parts.append("<><><><><><><><><><>")
    # Stripping tags leaves long runs of line breaks; collapse every run,
    # however long, instead of a fixed number of pairwise passes.
    return re.sub('(?:\r\n)+', "\r\n", "".join(parts))
def parseArticleFromLink(link):
    """Download one page and append its extracted text to ``filename``.

    Args:
        link: Absolute URL of the page to fetch.

    Returns:
        None. The text produced by ``parseArticleFromHtml`` is appended to
        the output file, preceded by a line break.
    """
    print(link)
    response = urllib.urlopen(link)
    try:
        content = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    article_string = parseArticleFromHtml(content)
    # Encode explicitly: writing non-ASCII text through the implicit ASCII
    # codec raises UnicodeEncodeError on Python 2.
    if not isinstance(article_string, bytes):
        article_string = article_string.encode("utf-8")

    # Binary append keeps the explicit "\r\n" line endings untouched, and
    # the with-block closes the file even if a write fails.
    with open(filename, 'ab') as output:
        output.write(b"\r\n")
        output.write(article_string)
# Crawl in two hops: the front page lists "rep" anchors, each of those
# pages lists question links, and only links containing "answer" are kept.
mylist = []
seen_links = set()  # the same answer can be listed on several pages
html = urllib.urlopen(url)
try:
    content = html.read()
finally:
    # Release the connection even when the read fails.
    html.close()
soup = BeautifulSoup(content, "html.parser")
info_cards = soup.findAll('a', {'class': 'rep'})
for an_info_cards in info_cards:
    if an_info_cards.span is not None:
        print(an_info_cards.span.string)
    card_href = an_info_cards.get("href")
    if not card_href:
        # An anchor without a target cannot be followed.
        continue
    newlink = url + card_href
    newhtml = urllib.urlopen(newlink)
    try:
        newcontent = newhtml.read()
    finally:
        newhtml.close()
    newsoup = BeautifulSoup(newcontent, "html.parser")
    question_links = newsoup.findAll('a', {'class': 'question_link'})
    for a_question_link in question_links:
        question_href = a_question_link.get("href")
        if not question_href:
            continue
        article_link = url + question_href
        if "answer" in article_link and article_link not in seen_links:
            seen_links.add(article_link)
            mylist.append(article_link)

print(len(mylist))
# Save at most `counter` articles; slicing covers both the "more than"
# and the "fewer than" case.
counter = 100
for index, item in enumerate(mylist[:counter]):
    print(index)
    parseArticleFromLink(item)

#coding:utf-8
import urllib
from bs4 import BeautifulSoup
import re

# Site root; relative hrefs scraped from the pages are appended to it.
url = "http://www.zhihu.com"
# Output file; every extracted article is appended to it.
filename = "zhihu_jingxuan.txt"

def parseArticleFromHtml(html):
    """Extract the page title and every answer body from a Zhihu page.

    Args:
        html: Raw HTML of the page, in any form ``BeautifulSoup`` accepts.

    Returns:
        Text that starts with ``<<title>>``, followed by one section per
        ``div.zm-editable-content`` element (each introduced by a line of
        asterisks, with every tag replaced by a line break), and ending
        with a ``<><>...`` terminator. Runs of consecutive line breaks are
        collapsed into a single one.
    """
    # Name the parser explicitly: letting bs4 pick one makes the parse
    # tree depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(html, "html.parser")

    # A page without <head>/<title>, or with an empty title, must not crash.
    head = soup.head
    title_tag = head.title if head is not None else None
    title = title_tag.string if title_tag is not None else None
    parts = ["<<", title or "", ">>\r\n"]

    for block in soup.findAll('div', {'class': 'zm-editable-content'}):
        # decode() yields text on Python 2 and 3 alike, so the markup can
        # be joined with the (unicode) title without implicit conversions.
        markup = block.decode()
        parts.append("*************************\r\n")
        parts.append(re.sub('<[^>]+>', "\r\n", markup))
        parts.append("\r\n")

    parts.append("<><><><><><><><><><>")
    # Stripping tags leaves long runs of line breaks; collapse every run,
    # however long, instead of a fixed number of pairwise passes.
    return re.sub('(?:\r\n)+', "\r\n", "".join(parts))

def parseArticleFromLink(link):
    """Download one page and append its extracted text to ``filename``.

    Args:
        link: Absolute URL of the page to fetch.

    Returns:
        None. The text produced by ``parseArticleFromHtml`` is appended to
        the output file, preceded by a line break.
    """
    print(link)
    response = urllib.urlopen(link)
    try:
        content = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    article_string = parseArticleFromHtml(content)
    # Encode explicitly: writing non-ASCII text through the implicit ASCII
    # codec raises UnicodeEncodeError on Python 2.
    if not isinstance(article_string, bytes):
        article_string = article_string.encode("utf-8")

    # Binary append keeps the explicit "\r\n" line endings untouched, and
    # the with-block closes the file even if a write fails.
    with open(filename, 'ab') as output:
        output.write(b"\r\n")
        output.write(article_string)

# Crawl in two hops: the front page lists "rep" anchors, each of those
# pages lists question links, and only links containing "answer" are kept.
mylist = []
seen_links = set()  # the same answer can be listed on several pages
html = urllib.urlopen(url)
try:
    content = html.read()
finally:
    # Release the connection even when the read fails.
    html.close()
soup = BeautifulSoup(content, "html.parser")
info_cards = soup.findAll('a', {'class': 'rep'})
for an_info_cards in info_cards:
    if an_info_cards.span is not None:
        print(an_info_cards.span.string)
    card_href = an_info_cards.get("href")
    if not card_href:
        # An anchor without a target cannot be followed.
        continue
    newlink = url + card_href
    newhtml = urllib.urlopen(newlink)
    try:
        newcontent = newhtml.read()
    finally:
        newhtml.close()
    newsoup = BeautifulSoup(newcontent, "html.parser")
    question_links = newsoup.findAll('a', {'class': 'question_link'})
    for a_question_link in question_links:
        question_href = a_question_link.get("href")
        if not question_href:
            continue
        article_link = url + question_href
        if "answer" in article_link and article_link not in seen_links:
            seen_links.add(article_link)
            mylist.append(article_link)

print(len(mylist))
# Save at most `counter` articles; slicing covers both the "more than"
# and the "fewer than" case.
counter = 100
for index, item in enumerate(mylist[:counter]):
    print(index)
    parseArticleFromLink(item)