直接上源码:
https://github.com/huahuizi/Iask-crawl
# iask main crawler file
# coding:utf-8
import requests, re
from pyquery import PyQuery as pq
from requests.exceptions import RequestException
from fake_useragent import UserAgent
from bs4 import BeautifulSoup as bs4
from Tools import Tool
import pymongo
from config import *

# Base site URL; relative hrefs scraped from pages are joined onto this.
SITE = "http://iask.sina.com.cn"

# Spoof a desktop Chrome user agent so the site serves normal pages.
headers = {
    'User-Agent': UserAgent().chrome,
}


def getHtml(url):
    """Fetch *url* and return the body text on HTTP 200, else None."""
    try:
        # timeout keeps the crawler from hanging forever on a stalled connection
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None


def get_next_url_one(content):
    """Return the absolute URL of the last '.btn-page' link (next page), or None."""
    try:
        return SITE + bs4(content, "lxml").select('.btn-page')[-1]['href']
    except (TypeError, IndexError, KeyError) as e:
        # TypeError: content is None; Index/KeyError: no pager on the page
        print(e)
        return None


def get_list_href(url):
    """Walk listing pages starting at *url*, scraping every question link.

    Iterative rather than recursive: the original tail-recursed once per
    page and would exhaust Python's recursion limit on a long crawl; it
    also passed None into pq() when the last page had no "next" button.
    """
    while url:
        html1 = getHtml(url)
        if html1 is None:
            return
        doc = pq(html1)
        for a in doc('.question-title a').items():
            page_url = SITE + a.attr.href
            get_detail_page(page_url)
            print(a.text())
        url = get_next_url_one(html1)
        print('正在抓取', url)


def get_detail_page(url):
    """Scrape one question page; <pre>[0] is the title, <pre>[1] the answer."""
    html = getHtml(url)
    if html is None:
        # download failed; skip this question instead of crashing bs4
        return
    newselect = bs4(html, "lxml").select('pre')
    try:
        title = Tool().replace(newselect[0].text)
        answer = Tool().replace(newselect[1].text)
        save_mongo({'title': title, 'answer': answer})
    except IndexError as e:
        # page did not contain both <pre> blocks
        print(e)


def save_mongo(dic):
    """Insert *dic* into the configured MongoDB collection.

    Bug fix: attribute access (client.MONGO_DB.MONGO_TB) wrote to a database
    literally named "MONGO_DB"; bracket access uses the names imported from
    config (MONGO_DB / MONGO_TB).
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    client[MONGO_DB][MONGO_TB].insert_one(dic)
    print("正在保存", dic)


if __name__ == '__main__':
    url = "http://iask.sina.com.cn/c/213-goodAnswer-180-new.html"
    html = getHtml(url)
    nextii = get_next_url_one(html)
    get_list_href(nextii)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
# iask 采集的主文件
# coding:utf-8
import
requests
,
re
from
pyquery
import
PyQuery
as
pq
from
requests
.
exceptions
import
RequestException
from
fake_useragent
import
UserAgent
# Base site URL; relative hrefs scraped from pages are joined onto this.
SITE = "http://iask.sina.com.cn"
from
bs4
import
BeautifulSoup
as
bs4
from
Tools
import
Tool
import
pymongo
from
config
import
*
# Spoof a desktop Chrome user agent so the site serves normal pages.
headers = {
    'User-Agent': UserAgent().chrome,
}
def getHtml(url):
    """Fetch *url* and return the body text on HTTP 200, else None.

    Returns None on any non-200 status or on any requests-level failure.
    """
    try:
        # timeout keeps the crawler from hanging forever on a stalled connection
        res = requests.get(url, headers=headers, timeout=10)
        if res.status_code == 200:
            return res.text
        return None
    except RequestException:
        return None
def get_next_url_one(content):
    """Return the absolute URL of the last '.btn-page' link (next page), or None.

    Narrowed from a blanket `except Exception` to the failures this lookup
    can actually produce, and made the None return explicit.
    """
    try:
        return SITE + bs4(content, "lxml").select('.btn-page')[-1]['href']
    except (TypeError, IndexError, KeyError) as e:
        # TypeError: content is None; Index/KeyError: no pager on the page
        print(e)
        return None
def get_list_href(url):
    """Walk listing pages starting at *url*, scraping every question link.

    Rewritten iteratively: the original tail-recursed once per page and
    would exhaust Python's recursion limit on a long crawl; it also passed
    None into pq() when the last page had no "next" button.
    """
    while url:
        html1 = getHtml(url)
        if html1 is None:
            # download failed; stop the walk instead of crashing pq()
            return
        doc = pq(html1)
        for a in doc('.question-title a').items():
            page_url = SITE + a.attr.href
            get_detail_page(page_url)
            print(a.text())
        url = get_next_url_one(html1)
        print('正在抓取', url)
def get_detail_page(url):
    """Scrape one question page; <pre>[0] is the title, <pre>[1] the answer.

    Guards against a failed download (getHtml returns None) and narrows the
    blanket `except Exception` to the IndexError this lookup can produce.
    """
    html = getHtml(url)
    if html is None:
        # download failed; skip this question instead of crashing bs4
        return
    newselect = bs4(html, "lxml").select('pre')
    try:
        title = Tool().replace(newselect[0].text)
        answer = Tool().replace(newselect[1].text)
        save_mongo({'title': title, 'answer': answer})
    except IndexError as e:
        # page did not contain both <pre> blocks
        print(e)
def save_mongo(dic):
    """Insert *dic* into the configured MongoDB collection.

    Bug fix: the original used attribute access (clent.MONGO_DB.MONGO_TB),
    which writes to a database literally named "MONGO_DB" and a collection
    literally named "MONGO_TB", ignoring the values imported from config.
    Bracket access uses the configured names.
    """
    client = pymongo.MongoClient(host='localhost', port=27017)
    client[MONGO_DB][MONGO_TB].insert_one(dic)
    print("正在保存", dic)
if __name__ == '__main__':
    # Crawl starts from the "good answers" listing of category 213.
    start_url = "http://iask.sina.com.cn/c/213-goodAnswer-180-new.html"
    listing_html = getHtml(start_url)
    first_page = get_next_url_one(listing_html)
    get_list_href(first_page)
|
# Tools: cleans up scraped page content
# coding:utf-8
import re


class Tool:
    """Strip/convert HTML tags in scraped text.

    NOTE(review): several patterns were garbled when this code passed
    through an HTML page (literal tags such as <br> and entities such as
    &nbsp; were rendered away, leaving broken strings). They are
    reconstructed below from the surrounding comments — confirm against
    the original repository.
    """

    # Remove ad-link wrapper divs
    removeADLink = re.compile('<div class="link_layer.*?</div>')
    # Remove <img> tags, runs of 1-7 spaces, and &nbsp; entities
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Remove anchor tags (keep their inner text)
    removeAddr = re.compile('<a.*?>|</a>')
    # Block-level tags become newlines
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become tabs
    replaceTD = re.compile('<td>')
    # Single or double <br> becomes a newline
    replaceBR = re.compile('<br><br>|<br>')
    # Strip any remaining tag
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of blank lines to one newline
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        """Return *x* with tags removed/converted and leading/trailing
        whitespace stripped."""
        x = re.sub(self.removeADLink, "", x)
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading/trailing leftovers
        return x.strip()
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
# Tools 采集内容处理文件
#coding:utf-8
import
re
#处理页面标签类
class Tool:
    """Strip/convert HTML tags in scraped text.

    NOTE(review): several patterns were garbled when this code passed
    through an HTML page (literal tags such as <br> and entities such as
    &nbsp; were rendered away, leaving broken strings). They are
    reconstructed below from the surrounding comments — confirm against
    the original repository.
    """

    # Remove ad-link wrapper divs
    removeADLink = re.compile('<div class="link_layer.*?</div>')
    # Remove <img> tags, runs of 1-7 spaces, and &nbsp; entities
    removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
    # Remove anchor tags (keep their inner text)
    removeAddr = re.compile('<a.*?>|</a>')
    # Block-level tags become newlines
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells become tabs
    replaceTD = re.compile('<td>')
    # Single or double <br> becomes a newline
    replaceBR = re.compile('<br><br>|<br>')
    # Strip any remaining tag
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of blank lines to one newline
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        """Return *x* with tags removed/converted and leading/trailing
        whitespace stripped."""
        x = re.sub(self.removeADLink, "", x)
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading/trailing leftovers
        return x.strip()
|
#config 数据库 配置文件 MONGO_DB="IASK" MONGO_TB="AUSWER"
|
1
2
3
|
# config: database configuration — MongoDB database / collection names
MONGO_DB = "IASK"
# NOTE(review): "AUSWER" looks like a typo for "ANSWER", but existing data
# may already live under this collection name — confirm before renaming.
MONGO_TB = "AUSWER"
|

2335

被折叠的 条评论
为什么被折叠?



