从给定网站抓取省市级城市数据

<?php
/**
* 名称: 抓取省市级城市
*
* 作者: 心灯QQ: 115434745 Email: bjbs_270@163.com
*
* 完成日期: 2005-9-12 14:00
*
* 功能: 从所给指定的网址中抓取数据并分析出自己想要的数据。
*
* 版权声明: 你可以无任何限制的发行、传播、修改
*
*/
/*
SQL:
CREATE TABLE province_city (
id int(11) unsigned NOT NULL auto_increment,
city varchar(200) character set latin1 collate latin1_bin NOT NULL default '',
parent_id smallint(4) unsigned NOT NULL default '0',
city_code varchar(10) character set latin1 collate latin1_bin NOT NULL default '',
PRIMARY KEY (id),
KEY parent_id (parent_id),
KEY city_code (city_code)
) ENGINE=MyISAM
*/
/**
 * Return the current Unix time in seconds, including the fractional
 * (microsecond) part.
 *
 * The script calls this before and after the crawl and subtracts the two
 * readings to report how long the scrape took.
 *
 * @return float Seconds since the Unix epoch with microsecond resolution.
 */
function getMicrotime()
{
    // microtime(true) already yields "seconds + fraction" as a single float.
    return microtime(true);
}
/**
 * Follow the link carried by an opening <a ...> tag and scrape the linked page.
 *
 * The href value is appended to the global $pre base URL and the resulting
 * page is fetched and parsed by getDataFromUrl().
 *
 * @param string $data Opening anchor tag, e.g. <a href=city.asp?id=1>.
 * @return array Whatever getDataFromUrl() returns for the linked page. When
 *               the tag carries no usable href, five empty lists are returned
 *               (the shape of an empty preg_match_all() result) so callers
 *               can still count() the groups.
 */
function filterData($data){
    global $pre;

    // Accept double-quoted, single-quoted and bare attribute values, and stop
    // at the end of the href itself so quotes or trailing attributes
    // (target=..., class=...) never end up inside the URL. The branch-reset
    // group (?|...) makes the value land in group 1 whichever form matched.
    $pattern = '/\bhref\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s>]+))/i';
    if (!preg_match($pattern, (string)$data, $found)) {
        // No href at all: do not request a meaningless URL.
        return array_fill(0, 5, array());
    }

    $href = trim($found[1]);
    if ($href === '') {
        return array_fill(0, 5, array());
    }

    return getDataFromUrl($pre.$href);
}
/**
 * Download a page and extract every <a> element found in it.
 *
 * All markup except anchor tags is stripped first; each remaining anchor is
 * then matched individually.
 *
 * @param string $url URL (or local path) of the page to read.
 * @return array preg_match_all() groups in pattern order:
 *               [0] whole element, [1] opening tag, [2] tag name,
 *               [3] inner text, [4] closing tag.
 *               Five empty lists when the page cannot be read, contains no
 *               anchors, or the match fails.
 */
function getDataFromUrl($url){
    $noMatches = array_fill(0, 5, array());

    $data = file_get_contents($url);
    if ($data === false) {
        // Unreachable page: report "nothing found" instead of passing a
        // non-string on to strip_tags()/preg_match_all().
        return $noMatches;
    }
    $data = strip_tags($data, "<a>");

    // \w+ captures the tag name, \/ is the slash of the closing tag and \2
    // refers back to the captured tag name. The inner text is matched lazily
    // so several anchors on the same line are returned as separate matches
    // rather than one match spanning from the first <a> to the last </a>.
    if (!preg_match_all('/(<(\w+)[^>]*>)(.*?)(<\/\2>)/i', $data, $matches)) {
        return $noMatches;
    }
    return $matches;
}

// ---------------------------------------------------------------------------
// Script body: scrape the province list, follow each province link to get its
// cities, print what was found, then store everything with one multi-row
// INSERT into province_city.
// ---------------------------------------------------------------------------

// One HTTP request is made per province, so lift PHP's execution time limit.
set_time_limit(0);
$startTime = getMicrotime();

// mysqli replaces the mysql_* extension, which no longer exists in PHP 7+.
$conn = mysqli_connect("localhost", "root", "", "365tag");
if (!$conn) {
    die(mysqli_connect_error());
}

// Base URL that filterData() prepends (through `global $pre`) to the relative
// links found on the province page.
$pre = "http://bjrd.beijing.gov.cn/life/life_com/code/";
$url = "http://bjrd.beijing.gov.cn/life/life_com/code/city.asp";
$matches = getDataFromUrl($url);

// Fall back to empty lists when the fetch or the match produced nothing, so
// the loops below simply do not run.
$provinceTags  = isset($matches[1]) ? $matches[1] : array();
$provinceNames = isset($matches[3]) ? $matches[3] : array();
$provinceCount = count($provinceNames);

$values = array();  // one "(id,'name',parent_id)" tuple per row to insert
$id_num = 0;        // number of city rows generated so far

for ($i = 0; $i < $provinceCount; $i++) {
    // Ids are assigned sequentially: each province is immediately followed by
    // its cities, so a province id is "provinces so far + cities so far + 1".
    $id = $id_num + $i + 1;
    echo "id_num: ".$id_num."<br>";
    echo "id:".$id."<br>";
    $j = $i + 1;
    // The page encoding is not known here, so the scraped text is escaped as
    // raw single bytes (ISO-8859-1): only < > & and quotes are rewritten and
    // multi-byte names pass through untouched.
    echo "<li><p>".$j.".".htmlspecialchars($provinceNames[$i], ENT_QUOTES, 'ISO-8859-1')."</p></li>";

    // Scraped text is untrusted: escape it before it goes into the statement.
    $values[] = "('".$id."','".mysqli_real_escape_string($conn, $provinceNames[$i])."',0)";

    $city = isset($provinceTags[$i]) ? filterData($provinceTags[$i]) : null;
    $cityNames = isset($city[3]) ? array_values($city[3]) : array();

    foreach ($cityNames as $k => $cityName) {
        $id_tmp = $id + $k + 1;
        $z = $k + 1;
        $values[] = "('".$id_tmp."','".mysqli_real_escape_string($conn, $cityName)."','".$id."')";

        echo $z.".".htmlspecialchars($cityName, ENT_QUOTES, 'ISO-8859-1');
        echo "<br>";
    }
    echo "<hr>";
    $id_num = $id_num + count($cityNames);
}

// Build the statement from the collected tuples. Joining with implode() avoids
// trimming a trailing comma, which would cut into the column list when no
// rows were scraped.
$sql = "";
if (count($values) > 0) {
    $sql = "INSERT INTO province_city (id,city, parent_id) VALUES ".implode(",", $values).";";
}
echo "SQL:<BR>".htmlspecialchars($sql, ENT_QUOTES, 'ISO-8859-1');
echo "<hr>";

$endTime = getMicrotime();
$execTime = $endTime-$startTime;
echo "<font size=2 color=blue>抓取及分析数据所用时间:".$execTime."</font>";

// Write the scraped rows to the database (skipped when nothing was scraped,
// because an INSERT without tuples is not valid SQL).
if ($sql === "") {
    die("<br>No data scraped from ".$url);
}
mysqli_query($conn, $sql) or die(mysqli_error($conn));
?>

从国家统计局抓取的地图省市区划代码和城乡划分代码(最新2020/06/03),共596071条数据。来源:国家统计局 http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/ 。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居……(示例数据在此处截断)
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值