网站信息采集

最新推荐文章于 2024-02-27 04:56:08 发布

原创最新推荐文章于 2024-02-27 04:56:08 发布 · 379 阅读

1 ·

CC 4.0 BY-SA版权

php 专栏收录该内容

47 篇文章

订阅专栏

此博客详细介绍了使用PHP爬取指定目录下.html文件，并对其进行解析，提取标题、内容、类别、发布日期等关键信息的过程。通过遍历目录结构，实现对网页信息的有效抓取与结构化存储。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

<?php
/**
 * Crawler written by MT.
 *
 * Entry section: declares the response charset, then starts the recursive
 * scan of the local "html" directory via movePath().
 */
header('Content-type: text/html; charset=utf-8');

$num = 0;
$dir = 'html'; // root directory to scan
movePath($dir);

/**
 * Recursively walks $dir and passes every "<number>.html" file to fillRead().
 *
 * A file qualifies when its name contains ".html" (not at position 0) and the
 * part of the name before the first ".html" is numeric, e.g. "123.html".
 * Sub-directories are always descended into. Paths that are not directories,
 * or directories that cannot be opened, are silently skipped.
 *
 * @param string $dir Directory to scan.
 * @return void
 */
function movePath($dir)
{
    if (!is_dir($dir)) {
        return;
    }

    $dh = opendir($dir);
    if ($dh === false) {
        return;
    }

    // Strict comparison is required here: readdir() returns the entry name,
    // and an entry called "0" is loosely equal to false, which would end the
    // scan early.
    while (($file = readdir($dh)) !== false) {
        if ($file === "." || $file === "..") {
            continue;
        }

        $path = $dir . "/" . $file;

        // Decide on the entry type first so a directory whose name happens to
        // contain ".html" is recursed into rather than parsed as a page.
        if (is_dir($path)) {
            movePath($path);
            continue;
        }

        $pos = strpos($file, ".html");
        if ($pos !== false && $pos > 0 && is_numeric(substr($file, 0, $pos))) {
            fillRead($path);
        }
    }

    closedir($dh);
}

/*
* $conn = mysql_connect("localhost","root","");
* if (!$conn){
* die('Could not connect: ' . mysql_error());
* }
*/
/**
 * Parses one saved article page and appends the extracted fields to
 * $_SERVER["xixi"], which the import loop later reads.
 *
 * Keys of the appended array:
 *  - title          text of the first <div class="title">
 *  - countent       inner HTML of the first <div class="arcContent">
 *                   (the misspelt key is kept because the import loop reads it)
 *  - category       second link of the "box2 ... labox ... mr_5 ... f_l" block
 *  - parentcategory third link of that block
 *  - public_time    strtotime() of the first field of the "info" block
 *  - laiyuan        second field of the "info" block
 *  - author         third field of the "info" block
 * A key whose markup is not found stays an empty string, so consumers never
 * hit an undefined index.
 *
 * Terminates the script with "Unable to open file!" when the file cannot be
 * read.
 *
 * @param string $file Path of the HTML file to parse.
 * @return void
 */
function fillRead($file)
{
    // file_get_contents() copes with zero-length files; the previous
    // fread($fh, filesize($file)) was handed a length of 0 for them, which
    // PHP 8 rejects with a ValueError.
    $html = file_get_contents($file);
    if ($html === false) {
        die("Unable to open file!");
    }

    $article = array(
        "title" => "",
        "countent" => "",
        "category" => "",
        "parentcategory" => "",
        "public_time" => "",
        "laiyuan" => "",
        "author" => "",
    );

    $title = "/<div class=\"title\">(.*?)<\/div>/ism";
    if (preg_match($title, $html, $found)) {
        $article["title"] = $found[1];
    }

    $content = "/<div class=\"arcContent\">(.*?)<\/div>/ism";
    if (preg_match($content, $html, $found)) {
        $article["countent"] = $found[1];
    }

    $category = "/class=\"box2.*?labox.*?mr_5.*?f_l\">.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?>/ism";
    if (preg_match($category, $html, $found)) {
        // The first link is ignored; only the second and third are stored.
        $article["category"] = $found[2];
        $article["parentcategory"] = $found[3];
    }

    $send = "/class=\"info\">.*?<small>(.*?)<\/div>/ism";
    if (preg_match($send, $html, $info)) {
        // Position of each field inside the info block -> target key.
        $fieldByPosition = array(0 => "public_time", 1 => "laiyuan", 2 => "author");
        $small = "/(<\/small>)+(.*)/i";

        // Fields are separated by two spaces; each one looks like
        // "label</small>value".
        foreach (explode("  ", $info[1]) as $key => $val) {
            $text = str_replace("\t", "", $val);
            $text = str_replace("\r\n", "", $text);

            // A dedicated result variable keeps the outer match intact.
            preg_match_all($small, $text, $parts);
            if (empty($parts[2]) || !isset($fieldByPosition[$key])) {
                continue;
            }

            $value = $parts[2][0];
            if ($key === 0) {
                $value = strtotime($value);
            }
            $article[$fieldByPosition[$key]] = $value;
        }
    }

    $_SERVER["xixi"][] = $article;
}


// Imports every article collected in $_SERVER["xixi"] into cx_article.
//
// The mysql_* extension was removed in PHP 7, so this section uses mysqli.
// Reporting is switched to return-value mode so that failures are handled by
// the explicit checks below on every PHP version (PHP 8.1+ throws by default).
mysqli_report(MYSQLI_REPORT_OFF);

$conn = mysqli_connect("localhost", "root", "");
if (!$conn) {
    die('Could not connect: ' . mysqli_connect_error());
}

mysqli_select_db($conn, "cmysw");
mysqli_set_charset($conn, "utf8");
$cid = null;
$pid = null;

/**
 * Looks up a category id in cx_category by substring match on the name.
 *
 * When several rows match, the id of the last row returned wins. An empty
 * name returns null instead of matching every category through '%%'.
 *
 * @param mysqli $conn Open database connection.
 * @param string $name Category name scraped from the page.
 * @return mixed The matching id, or null when nothing matches or the query fails.
 */
function findCategoryId($conn, $name)
{
    if ($name === '') {
        return null;
    }

    $stmt = mysqli_prepare($conn, "select id from cx_category where name like ?");
    if (!$stmt) {
        return null;
    }

    $pattern = '%' . $name . '%';
    mysqli_stmt_bind_param($stmt, 's', $pattern);

    $id = null;
    if (mysqli_stmt_execute($stmt)) {
        mysqli_stmt_bind_result($stmt, $foundId);
        while (mysqli_stmt_fetch($stmt)) {
            $id = $foundId;
        }
    }
    mysqli_stmt_close($stmt);

    return $id;
}

// Scraped text is bound as parameters: page content routinely contains quotes
// and must never be spliced into the SQL string.
$insert = mysqli_prepare($conn, "insert into cx_article(cid,title,title_color,author,editor,summary,content,publish_date,create_date,keywords,article_view,is_check,is_pass,create_user)
values(?,?,'',?,?,?,?,?,?,'','','1','1','1')");
if (!$insert) {
    die('Could not prepare insert: ' . mysqli_error($conn));
}

// Nothing was collected when the crawl found no matching files.
$rows = isset($_SERVER["xixi"]) ? $_SERVER["xixi"] : array();

foreach ($rows as $row) {
    // Both ids are recomputed for every article so a failed lookup can never
    // reuse the id found for the previous article.
    // NOTE(review): $cid is looked up but the insert stores $pid in the cid
    // column, as the original code did - confirm which id is intended.
    $cid = findCategoryId($conn, isset($row["category"]) ? (string) $row["category"] : '');
    $pid = findCategoryId($conn, isset($row["parentcategory"]) ? (string) $row["parentcategory"] : '');

    if (empty($pid)) {
        continue;
    }

    $title = isset($row["title"]) ? (string) $row["title"] : '';
    if ($title === '') {
        continue;
    }

    $content = isset($row["countent"]) ? (string) $row["countent"] : '';
    $author = isset($row["author"]) ? (string) $row["author"] : '';
    $source = isset($row["laiyuan"]) ? (string) $row["laiyuan"] : '';
    // strtotime() may have produced false; the cast turns that into ''.
    $publishDate = isset($row["public_time"]) ? (string) $row["public_time"] : '';
    $createDate = $publishDate;

    // Summary: tag-free text with all whitespace and &nbsp; removed, first 50 characters.
    $summary = preg_replace("/(\s|\&nbsp\;|　|\xc2\xa0)/", "", strip_tags($content));
    $summary = mb_substr($summary, 0, 50, 'utf-8');

    mysqli_stmt_bind_param(
        $insert,
        'ssssssss',
        $pid,
        $title,
        $author,
        $source,
        $summary,
        $content,
        $publishDate,
        $createDate
    );

    if (!mysqli_stmt_execute($insert)) {
        // Keep importing the remaining articles, but leave a trace.
        error_log('Article insert failed: ' . mysqli_stmt_error($insert));
    }
}

mysqli_stmt_close($insert);

?>