<?php
/**
* @MT制作的爬虫
*/
// Declare the response as UTF-8 HTML.
header("Content-type: text/html; charset=utf-8");
$dir = "html"; // directory to read the crawled pages from
$num = 0; // NOTE(review): not referenced anywhere in the visible code
// Walk $dir: movePath() hands each matching page to fillRead(), which appends the
// parsed fields to $_SERVER["xixi"].
movePath($dir);
/**
 * Recursively walk a directory and pass every numerically named ".html" page to fillRead().
 *
 * An entry whose name contains ".html" (anywhere but at position 0) is treated as a page; it is
 * parsed only when the part of the name before ".html" is numeric (e.g. "123.html"). Every other
 * entry is treated as a possible sub-directory and walked recursively; plain files are filtered
 * out by the is_dir() guard at the top of the recursive call.
 *
 * @param string $dir Directory to scan, without a trailing slash.
 * @return void
 */
function movePath($dir)
{
    if (!is_dir($dir)) {
        return;
    }

    $dh = opendir($dir);
    if ($dh === false) {
        return;
    }

    // Compare with false strictly: readdir() can return names such as "0" that are falsy, and a
    // loose comparison would end the scan early and silently skip all remaining entries.
    while (($file = readdir($dh)) !== false) {
        if ($file === "." || $file === "..") {
            continue;
        }

        $extensionAt = strpos($file, ".html");
        if ($extensionAt) {
            // Only pages whose base name is numeric are parsed; other ".html" entries are ignored.
            if (is_numeric(substr($file, 0, $extensionAt))) {
                fillRead($dir . "/" . $file);
            }
        } else {
            movePath($dir . "/" . $file);
        }
    }

    closedir($dh);
}
/*
* $conn = mysql_connect("localhost","root","");
* if (!$conn){
* die('Could not connect: ' . mysql_error());
* }
*/
/**
 * Parse one crawled article page and queue the extracted fields for import.
 *
 * The page is scanned with regular expressions and the fields that were found are collected in
 * an associative array, which is appended to $_SERVER["xixi"]. A key is only set when its markup
 * was found, so consumers must not assume that a key exists. Possible keys:
 *   - "title"          content of the first <div class="title">
 *   - "countent"       content of the first <div class="arcContent"> (the spelling of the key is
 *                      kept because the import code reads the value under this name)
 *   - "category"       text of the 2nd link inside the "box2 ... f_l" block
 *   - "parentcategory" text of the 3rd link inside the same block
 *   - "public_time"    strtotime() of the 1st space-separated item of the "info" block
 *   - "laiyuan"        2nd space-separated item of the "info" block
 *   - "author"         3rd space-separated item of the "info" block
 *
 * Terminates the script with "Unable to open file!" when the file cannot be opened.
 *
 * @param string $file Path of the HTML file to parse.
 * @return void
 */
function fillRead($file)
{
    $array = array();

    $myfile = fopen($file, "r") or die("Unable to open file!");
    // Read up to EOF rather than fread($myfile, filesize($file)): for an empty file that call
    // requests 0 bytes, which fread() rejects (a warning before PHP 8, a ValueError since).
    $return = stream_get_contents($myfile);
    fclose($myfile);
    if ($return === false) {
        $return = "";
    }

    // Only the first occurrence of each element is used, so preg_match() is sufficient.
    $title = "/<div class=\"title\">(.*?)<\/div>/ism";
    if (preg_match($title, $return, $found)) {
        $array["title"] = $found[1];
    }

    $content = "/<div class=\"arcContent\">(.*?)<\/div>/ism";
    if (preg_match($content, $return, $found)) {
        $array["countent"] = $found[1];
    }

    // Three consecutive links are captured; the 2nd and 3rd captures are stored.
    $category = "/class=\"box2.*?labox.*?mr_5.*?f_l\">.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?<a.*?href=\'.*?\'.*?>(.*?)<\/a>.*?>/ism";
    if (preg_match($category, $return, $found)) {
        $array["category"] = $found[2];
        $array["parentcategory"] = $found[3];
    }

    // The "info" block is split on spaces; in each item the value is whatever follows "</small>".
    $send = "/class=\"info\">.*?<small>(.*?)<\/div>/ism";
    if (preg_match($send, $return, $info)) {
        $small = "/(<\/small>)+(.*)/i";
        foreach (explode(" ", $info[1]) as $key => $val) {
            $text = str_replace("\t", "", $val);
            $text = str_replace("\r\n", "", $text);
            if (!preg_match($small, $text, $field)) {
                continue;
            }
            // The position of the item decides which field it fills; later items are ignored.
            switch ($key) {
                case 0:
                    $array["public_time"] = strtotime($field[2]);
                    break;
                case 1:
                    $array["laiyuan"] = $field[2];
                    break;
                case 2:
                    $array["author"] = $field[2];
                    break;
            }
        }
    }

    $_SERVER["xixi"][] = $array;
}
// ---------------------------------------------------------------------------------------------
// Import step: store every article collected in $_SERVER["xixi"] in the cx_article table.
//
// For each article the category id is looked up in cx_category by the "parentcategory" name
// (substring match; when several rows match, the last one returned wins) and written to
// cx_article.cid. An article is skipped when it has no title, no category name, or when the
// category cannot be resolved.
//
// All values are passed through prepared statements: they come from crawled HTML and regularly
// contain quotes, which would otherwise break the statement or allow SQL injection.
// mysqli is used because ext/mysql (mysql_connect() etc.) no longer exists as of PHP 7.
// ---------------------------------------------------------------------------------------------

// Report failures through return values on every PHP version (PHP 8.1+ throws by default).
mysqli_report(MYSQLI_REPORT_OFF);

// "@" keeps the raw connection warning out of the HTML output; the failure is reported by die().
$conn = @mysqli_connect("localhost", "root", "", "cmysw");
if (!$conn) {
    die('Could not connect: ' . mysqli_connect_error());
}
$conn->set_charset("utf8");

$findCategory = $conn->prepare("select id from cx_category where name like ?");
$insertArticle = $conn->prepare("insert into cx_article(cid,title,title_color,author,editor,summary,content,publish_date,create_date,keywords,article_view,is_check,is_pass,create_user)
values(?,?,'',?,?,?,?,?,?,'','','1','1','1')");
if (!$findCategory || !$insertArticle) {
    die('Could not prepare statements: ' . $conn->error);
}

// Nothing was collected when the crawl directory is missing or holds no matching page.
$rows = isset($_SERVER["xixi"]) ? $_SERVER["xixi"] : array();

foreach ($rows as $row) {
    // fillRead() only sets the keys it found, so every field needs a default.
    $title = isset($row["title"]) ? $row["title"] : "";
    $parentName = isset($row["parentcategory"]) ? $row["parentcategory"] : "";
    // An empty name would turn the lookup into "like '%%'", which matches every category.
    if ($title === "" || $parentName === "") {
        continue;
    }

    // Start from null for every article so that an unknown category is skipped instead of
    // silently reusing the id resolved for a previous article.
    $pid = null;
    $like = "%" . $parentName . "%";
    $findCategory->bind_param("s", $like);
    if ($findCategory->execute()) {
        $findCategory->bind_result($matchedId);
        while ($findCategory->fetch()) {
            $pid = $matchedId;
        }
        $findCategory->free_result();
    } else {
        error_log("cx_category lookup failed for \"" . $parentName . "\": " . $findCategory->error);
    }
    if (empty($pid)) {
        continue;
    }

    $content = isset($row["countent"]) ? $row["countent"] : "";
    $author = isset($row["author"]) ? $row["author"] : "";
    $source = isset($row["laiyuan"]) ? $row["laiyuan"] : "";
    // strtotime() may have returned false; like a missing value it is stored as an empty string.
    $publishTime = isset($row["public_time"]) ? (string) $row["public_time"] : "";

    // Summary: the first 50 characters of the text with tags removed and with whitespace,
    // "&nbsp;" entities, ideographic spaces (U+3000) and no-break spaces (U+00A0) stripped.
    $summary = preg_replace("/(\s|&nbsp;|\xe3\x80\x80|\xc2\xa0)/", "", strip_tags($content));
    $summary = mb_substr($summary, 0, 50, 'utf-8');

    // The publish time is used for both publish_date and create_date.
    $insertArticle->bind_param(
        "ssssssss",
        $pid,
        $title,
        $author,
        $source,
        $summary,
        $content,
        $publishTime,
        $publishTime
    );
    if (!$insertArticle->execute()) {
        // Keep importing the remaining articles, but leave a trace of the failure.
        error_log("cx_article insert failed for \"" . $title . "\": " . $insertArticle->error);
    }
}

$findCategory->close();
$insertArticle->close();
$conn->close();
/*
 * End of script. The closing PHP tag is omitted on purpose: PHP sends everything that follows
 * such a tag to the client verbatim, so stray text after it (for example a pasted second copy
 * of this script, including the database credentials) would end up in the response.
 */