spider-web 是爬虫的网页版,使用 XML 配置,支持大部分页面的爬取,支持爬取内容的保存、下载等。
其中配置文件格式为:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
<?
xml
version
=
"1.0"
encoding
=
"UTF-8"
?>
<
content
>
<
url
type
=
"simple"
>
<!-- simple/complex -->
<
url_head
>http://www.oschina.net/tweets</
url_head
>
<
url_start
></
url_start
>
<
url_end
></
url_end
>
<
url_suffix
></
url_suffix
>
</
url
>
<
analysis
type
=
"list"
>
<!-- single/list -->
<
elem
name
=
"title"
>
<
attr
type
=
"key"
num
=
"1"
>
<!-- tag/class/key -->
<
name
>link</
name
>
<!-- $http://my.oschina.net/(.)* -->
<
pro
>http://my.oschina.net/(.)*/[0-9]*</
pro
>
</
attr
>
<
attr
type
=
"class"
num
=
"2"
>
<!-- tag/class/key -->
<
name
>tweet</
name
>
<
pro
>a</
pro
>
</
attr
>
<
attr
type
=
"class"
num
=
"3"
>
<
name
>txt</
name
>
<
pro
>a</
pro
>
</
attr
>
<
attr
type
=
"tag"
num
=
"4"
>
<
name
>a</
name
>
<
pro
>a</
pro
>
</
attr
>
</
elem
>
<
elem
name
=
"content"
>
<
attr
type
=
"key"
num
=
"1"
>
<!-- tag/class/key -->
<
name
>link</
name
>
<!-- $http://my.oschina.net/(.)* -->
<
pro
>http://my.oschina.net/(.)*/[0-9]*</
pro
>
</
attr
>
<
attr
type
=
"class"
num
=
"2"
>
<!-- tag/class/key -->
<
name
>tweet</
name
>
<
pro
>a</
pro
>
</
attr
>
<
attr
type
=
"class"
num
=
"3"
>
<
name
>txt</
name
>
<
pro
>a</
pro
>
</
attr
>
</
elem
>
</
analysis
>
<
target
type
=
"download"
>
<!-- download/text -->
</
target
>
</
content
>
|
根据不同的页面进行设置,可以支持比较流行的页面爬取。