-- webpage: crawl-state table with one row per page, keyed by `id`.
-- Columns of the BLOB family hold opaque binary values; their serialization
-- format is not visible from this DDL.
-- NOTE(review): MySQL's `utf8` is the 3-byte utf8mb3 character set, so 4-byte
-- characters cannot be stored in `text` / `title` — confirm whether utf8mb4 is needed.
CREATE TABLE `webpage` (
    `id`                VARCHAR(250)  NOT NULL,                        -- primary key
    `headers`           BLOB,                                          -- presumably the fetched response headers — confirm
    `text`              MEDIUMTEXT,
    `status`            INT(11)       DEFAULT NULL,                    -- crawl status code: 2 fetched, 3 gone, 4 redir_temp,
                                                                       -- 5 redir_perm, 34 retry, 38 not modified
    `markers`           BLOB,
    `parseStatus`       BLOB,
    `modifiedTime`      BIGINT(20)    DEFAULT NULL,                    -- NOTE(review): time unit not stated here — confirm
    `prevModifiedTime`  BIGINT(20)    DEFAULT NULL,
    `score`             FLOAT         DEFAULT NULL,
    `typ`               VARCHAR(32)   CHARACTER SET latin1 DEFAULT NULL,  -- latin1 overrides the table's utf8 default
    `batchId`           VARCHAR(32)   CHARACTER SET latin1 DEFAULT NULL,  -- latin1 overrides the table's utf8 default
    `baseUrl`           VARCHAR(767)  DEFAULT NULL,
    `content`           LONGBLOB,
    `title`             VARCHAR(2048) DEFAULT NULL,
    `reprUrl`           VARCHAR(767)  DEFAULT NULL,                    -- relevant to redirect statuses 4 / 5
    `fetchInterval`     INT(11)       DEFAULT NULL,
    `prevFetchTime`     BIGINT(20)    DEFAULT NULL,
    `inlinks`           MEDIUMBLOB,
    `prevSignature`     BLOB,
    `outlinks`          MEDIUMBLOB,
    `fetchTime`         BIGINT(20)    DEFAULT NULL,
    `retriesSinceFetch` INT(11)       DEFAULT NULL,
    `protocolStatus`    BLOB,                                          -- protocol-level result; known codes include SUCCESS 1,
                                                                       -- FAILED 2, PROTO_NOT_FOUND 10, GONE 11, MOVED 12,
                                                                       -- TEMP_MOVED 13, NOTFOUND 14, RETRY 15, EXCEPTION 16,
                                                                       -- ACCESS_DENIED 17, ROBOTS_DENIED 18, REDIR_EXCEEDED 19,
                                                                       -- NOTFETCHING 20, NOTMODIFIED 21, WOULDBLOCK 22, BLOCKED 23
    `signature`         BLOB,
    `metadata`          BLOB,                                          -- custom metadata; can be supplied per URL in the seed
                                                                       -- file, e.g. "<url> \t type=news"
    PRIMARY KEY (`id`)
)
    ENGINE = InnoDB
    DEFAULT CHARSET = utf8
    ROW_FORMAT = COMPRESSED;
id
headers
status
2 fetched (the page was fetched successfully)
3 gone (the page no longer exists)
4 redir_temp (temporary redirect — see reprUrl below for details)
5 redir_perm (permanent redirect — see reprUrl below for details)
34 retry
38 not modified
markers
parseStatus
modifiedTime
score
typ
batchId
baseUrl
content
title
reprUrl
fetchInterval
prevFetchTime
inlinks
prevSignature
outlinks
fetchTime
retriesSinceFetch
signature
metadata
自定义元数据,可以在种子文件里面加,例如: http://xxxx/xxx.html \t type=news
protocolStatus
ACCESS_DENIED 17
BLOCKED 23
EXCEPTION 16
FAILED 2
GONE 11
MOVED 12
NOTFETCHING 20
NOTFOUND 14
NOTMODIFIED 21
PROTO_NOT_FOUND 10
REDIR_EXCEEDED 19
RETRY 15
ROBOTS_DENIED 18
SUCCESS 1
TEMP_MOVED 13
WOULDBLOCK 22
本文介绍了一个用于描述网页抓取状态的数据库表结构,详细解释了每个字段的作用及重要性,包括主键ID的生成方式、状态码含义等,并提供了网页重要性的计算方法。
4648

被折叠的 条评论
为什么被折叠?



