作者: 共创联盟 加入时间: 2003-11-24 浏览次数: 322
抓取网页,并将文字和图片存入数据库中,利用getimg.php?id=读取数据库中的图片 getarticle.php?id=读取文档 <?
/**
 * Table definitions used by SaveWeb.
 *
 * article.articletype values: 1 = oracle, 2 = java, 3 = system.
 *
 * CREATE TABLE article (
 *   id          int(6)      NOT NULL auto_increment,
 *   title       varchar(80) default NULL,
 *   content     text,
 *   url         varchar(80) default NULL,
 *   joindate    varchar(12) default NULL,
 *   articletype int(2)      NOT NULL,
 *   PRIMARY KEY (id)
 * );
 *
 * CREATE TABLE images (
 *   id        int(4)      NOT NULL auto_increment,
 *   bin_data  longblob,
 *   filetype  varchar(50) default NULL,
 *   title     varchar(50) default NULL,
 *   articleid int(6)      NOT NULL,
 *   PRIMARY KEY (id)
 * ) TYPE=MyISAM;
 *
 * Note: current MySQL versions spell the last clause ENGINE=MyISAM; the
 * TYPE= form shown above is only accepted by old servers.
 */
/**
 * Downloads a web page and archives it in MySQL.
 *
 * The page HTML goes into the `article` table. Every <img> the page references
 * over http(s) is downloaded into the `images` table and its src is rewritten
 * to `$getimg . <image id>`. Relative <a href> values are made absolute so the
 * stored copy still links back to the original site.
 *
 * Requires PHP 5.6+ with the mysqli extension.
 */
class SaveWeb
{
    /** @var string Article title; also stored on each image so it can be linked to the article. */
    public $title;

    /** @var string URL of the page; also the base for resolving relative references. */
    public $url;

    /** @var int Value written to article.articletype. */
    public $typeid;

    /** @var string|null HTML supplied through setContent() instead of being downloaded. */
    public $content;

    /** @var bool When true webSave() downloads $url; setContent() switches this off. */
    public $getUrl = true;

    /** @var string Prefix of the rewritten image URLs; the image id is appended to it. */
    public $getimg = "getimg.php?id=";

    // NOTE(review): credentials are hard-coded defaults kept for backward
    // compatibility; callers should overwrite these properties from configuration.
    public $dbuser = "root";
    public $dbpassword = "whf76128";
    public $dbname = "tech";
    public $dbhost = "127.0.0.1";

    /**
     * @param string $title  Article title.
     * @param string $url    Source URL of the page.
     * @param int    $typeid Article type id.
     */
    public function __construct($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * PHP 4 style initialiser, kept so code that calls SaveWeb() explicitly
     * (for example parent::SaveWeb(...)) keeps working. PHP 8 no longer treats
     * this method as a constructor, which is why __construct() exists.
     */
    public function SaveWeb($title, $url, $typeid)
    {
        $this->__construct($title, $url, $typeid);
    }

    /**
     * Supplies the HTML directly, so webSave() will not download $url.
     *
     * @param string $html
     */
    public function setContent($html)
    {
        $this->content = $html;
        $this->getUrl = false;
    }

    /**
     * Stores $content as plain paragraph text (newlines become <br>), without
     * any tag rewriting.
     *
     * @return int|false New article id, or false when the insert failed.
     */
    public function saveContent()
    {
        return $this->insertArticle(nl2br((string) $this->content));
    }

    /**
     * Archives the page: obtains the HTML, rewrites its tags, stores the
     * article, and attaches the images fetched for it.
     *
     * @return int|false New article id; false when title/url are empty, the
     *                   download failed, or the insert failed.
     */
    public function webSave()
    {
        if ($this->title == "" || $this->url == "") {
            return false;
        }

        $text = $this->getUrl == true ? $this->getHtml($this->url) : $this->content;
        if ($text === false) {
            // Download failed: do not archive an empty article.
            return false;
        }

        $id = $this->saveHtml($this->parserHtml($text));
        if ($id) {
            $this->updateImgPID($id, $this->title);
        }
        // Whatever is still unattached at this point is discarded.
        $this->delImg();

        return $id;
    }

    /**
     * Case-insensitive search for $strchild inside $strobj starting at byte
     * offset $int.
     *
     * @param string $strobj   Haystack.
     * @param string $strchild Needle.
     * @param int    $int      Start offset; negative values are treated as 0.
     * @return int|false Byte offset of the match, or false when not found.
     *                   A match at the very start returns 0, so compare the
     *                   result with === false.
     */
    public function strfind($strobj, $strchild, $int)
    {
        $strobj = (string) $strobj;
        $strchild = (string) $strchild;
        $int = max(0, (int) $int);

        // stripos() rejects an empty needle or an offset past the end on
        // some PHP versions, so answer those cases here.
        if ($strchild === '' || $int > strlen($strobj)) {
            return false;
        }

        return stripos($strobj, $strchild, $int);
    }

    /**
     * Reads the whole resource at $url (any stream fopen() can open).
     * On failure an HTML error line is echoed, as callers have always seen.
     *
     * @param string $url
     * @return string|false Raw bytes, or false when the resource cannot be read.
     */
    public function getHtml($url)
    {
        $url = (string) $url;
        $fp = $url === '' ? false : fopen($url, "rb");
        if ($fp === false) {
            // The URL may come from scraped markup, so escape it before echoing.
            $shown = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE);
            echo "<font color=red>读取失败,文件位置:" . $shown . "</font><br>";
            return false;
        }

        $data = stream_get_contents($fp);
        fclose($fp);

        return $data;
    }

    /**
     * Deletes every image row that is not attached to an article
     * (articleid = 0).
     */
    public function delImg()
    {
        $this->runStatement("DELETE FROM images WHERE articleid = 0");
    }

    /**
     * Attaches the pending images stored under $title to article $id.
     *
     * Only rows still at articleid = 0 are touched, so images of an earlier
     * article that happens to share the title are not re-parented.
     *
     * @param int    $id    Article id.
     * @param string $title Title the images were stored under.
     */
    public function updateImgPID($id, $title)
    {
        $this->runStatement(
            "UPDATE images SET articleid = ? WHERE title = ? AND articleid = 0",
            'is',
            array((int) $id, (string) $title)
        );
    }

    /**
     * Stores $data as the article content.
     *
     * @param string $data HTML to store.
     * @return int|false New article id, or false when the insert failed.
     */
    public function saveHtml($data)
    {
        return $this->insertArticle((string) $data);
    }

    /**
     * Downloads $url and stores it as a pending image (articleid = 0) tagged
     * with the current title.
     *
     * @param string $url Absolute image URL.
     * @return int|false New image id, or false when the download or the
     *                   insert failed.
     */
    public function saveImg($url)
    {
        $data = $this->getHtml($url);
        if ($data === false) {
            // Do not store an empty placeholder for an image that failed to load.
            return false;
        }

        return $this->runStatement(
            "INSERT INTO images (bin_data,filetype,title,articleid) VALUES (?,?,?,0)",
            'sss',
            array($data, $this->getContentType($url), (string) $this->title)
        );
    }

    /**
     * @param string $inFileName Path or URL.
     * @return string File name without its directory part.
     */
    public function getContentName($inFileName)
    {
        return basename($inFileName);
    }

    /**
     * Maps a file name or URL to a MIME type using its extension.
     * The extension is matched case-insensitively and a URL query string is
     * ignored.
     *
     * @param string $inFileName Path or URL.
     * @return string MIME type; "application/octet-stream" when unknown.
     */
    public function getContentType($inFileName)
    {
        $types = array(
            ".gif"  => "image/gif",
            ".gz"   => "application/x-gzip",
            ".htm"  => "text/html",
            ".html" => "text/html",
            ".jpg"  => "image/jpeg",
            ".jpeg" => "image/jpeg",
            ".tar"  => "application/x-tar",
            ".txt"  => "text/plain",
            ".zip"  => "application/zip",
            ".png"  => "image/png",
            ".bmp"  => "image/bmp",
        );

        // Strip any query string / fragment, then the directory part.
        $path = parse_url((string) $inFileName, PHP_URL_PATH);
        if (!is_string($path) || $path === '') {
            $path = (string) $inFileName;
        }

        $extension = strrchr(basename($path), ".");
        if ($extension === false) {
            return "application/octet-stream";
        }

        $extension = strtolower($extension);

        return isset($types[$extension]) ? $types[$extension] : "application/octet-stream";
    }

    /**
     * Rewrites the tags of a downloaded page.
     *
     * - <img src>: each http(s) image is stored through saveImg() and its src
     *   becomes `$getimg . id`. Images that cannot be stored, that already
     *   point at the image script, or that use another scheme (data:, ...)
     *   are left untouched.
     * - <a href>: relative links are made absolute against $url. Fragments
     *   and links with their own scheme (mailto:, javascript:, ...) are left
     *   untouched.
     *
     * @param string $text HTML of the page.
     * @return string Rewritten HTML.
     */
    public function parserHtml($text)
    {
        $text = (string) $text;
        list($scheme, $root, $dir) = $this->baseUrlParts();

        $self = $this;
        $scriptPath = parse_url((string) $this->getimg, PHP_URL_PATH);
        $stored = array(); // absolute image URL => rewritten src, so each image is fetched once

        // Rewrite <img> tags.
        $text = $this->rewriteAttribute(
            $text,
            'img',
            'src',
            function ($ref) use ($self, $scheme, $root, $dir, $scriptPath, &$stored) {
                $absolute = $self->resolveReference($ref, $scheme, $root, $dir);
                if ($absolute === null) {
                    return null;
                }
                // Already served by the image script: nothing to fetch.
                if (is_string($scriptPath) && $scriptPath !== '' && stripos($absolute, $scriptPath) !== false) {
                    return null;
                }
                if (!isset($stored[$absolute])) {
                    $id = $self->saveImg($absolute);
                    if (!$id) {
                        return null;
                    }
                    $stored[$absolute] = $self->getimg . $id;
                }

                return $stored[$absolute];
            }
        );

        // Rewrite <a> tags.
        $text = $this->rewriteAttribute(
            $text,
            'a',
            'href',
            function ($ref) use ($self, $scheme, $root, $dir) {
                return $self->resolveReference($ref, $scheme, $root, $dir);
            }
        );

        return $text;
    }

    /**
     * Resolves a reference found in the page against the page URL.
     * Public only so the parserHtml() callbacks can reach it on every
     * supported PHP version; not part of the documented API.
     *
     * @internal
     * @param string $ref    Attribute value as written in the page.
     * @param string $scheme Scheme of the page URL.
     * @param string $root   scheme://host[:port] of the page URL.
     * @param string $dir    $root plus the page's directory, ending in "/".
     * @return string|null Absolute http(s) URL, or null when the reference
     *                     must be left alone (empty, fragment, other scheme).
     */
    public function resolveReference($ref, $scheme, $root, $dir)
    {
        $ref = trim((string) $ref);
        if ($ref === '' || $ref[0] === '#') {
            return null;
        }
        if (preg_match('~^https?://~i', $ref)) {
            return $ref;
        }
        // Any other scheme (mailto:, javascript:, data:, ftp:, file:, ...).
        if (preg_match('~^[a-z][a-z0-9+.\-]*:~i', $ref)) {
            return null;
        }
        if (substr($ref, 0, 2) === '//') {
            return $scheme . ':' . $ref;
        }
        if ($ref[0] === '/') {
            return $root . $ref;
        }

        return $dir . $ref;
    }

    /**
     * Splits $url into the pieces needed to resolve relative references.
     *
     * @return array [scheme, scheme://host[:port], that plus the directory with trailing "/"]
     */
    private function baseUrlParts()
    {
        $parts = parse_url((string) $this->url);
        if (!is_array($parts)) {
            $parts = array();
        }

        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $root = $scheme . '://' . (isset($parts['host']) ? $parts['host'] : '');
        if (isset($parts['port'])) {
            $root .= ':' . $parts['port'];
        }

        // Directory = everything up to and including the last "/" of the path.
        $path = isset($parts['path']) && $parts['path'] !== '' ? $parts['path'] : '/';
        if ($path[0] !== '/') {
            $path = '/' . $path;
        }
        $directory = substr($path, 0, strrpos($path, '/') + 1);

        return array($scheme, $root, $root . $directory);
    }

    /**
     * Replaces the value of the first $attr attribute of every <$tag ...> in
     * $html, in place, keeping the original quoting style.
     *
     * @param string   $html
     * @param string   $tag    Tag name, matched case-insensitively.
     * @param string   $attr   Attribute name, matched case-insensitively.
     * @param callable $mapper Receives the current value; returns the new
     *                         value, or null to leave the attribute as is.
     * @return string
     */
    private function rewriteAttribute($html, $tag, $attr, $mapper)
    {
        // The lookahead keeps "<a" from matching <abbr>, <area>, ...
        $tagPattern = '~<' . preg_quote($tag, '~') . '(?=[\s/>])[^>]*>~i';
        // Groups: 1 = ` attr=`, 2 = quote char (empty when unquoted),
        // 3 = quoted value, 4 = unquoted value.
        $attrPattern = '~(\s' . preg_quote($attr, '~') . '\s*=\s*)(?:(["\'])(.*?)\2|([^\s"\'>]+))~is';

        $result = preg_replace_callback(
            $tagPattern,
            function ($tagMatch) use ($attrPattern, $mapper) {
                $rewritten = preg_replace_callback(
                    $attrPattern,
                    function ($m) use ($mapper) {
                        $quote = $m[2];
                        $value = $quote !== '' ? $m[3] : $m[4];
                        $replacement = call_user_func($mapper, $value);
                        if ($replacement === null) {
                            return $m[0];
                        }

                        return $m[1] . $quote . $replacement . $quote;
                    },
                    $tagMatch[0],
                    1
                );

                return $rewritten === null ? $tagMatch[0] : $rewritten;
            },
            $html
        );

        // preg_replace_callback() yields null on a PCRE error; keep the input then.
        return $result === null ? $html : $result;
    }

    /**
     * Inserts one row into `article` for the current title/url/type.
     *
     * @param string $content Article body.
     * @return int|false New article id, or false on failure.
     */
    private function insertArticle($content)
    {
        return $this->runStatement(
            "INSERT INTO article (title,content,url,joindate,articletype) VALUES (?,?,?,?,?)",
            'ssssi',
            array(
                (string) $this->title,
                (string) $content,
                (string) $this->url,
                gmdate("Y-m-d"),
                (int) $this->typeid,
            )
        );
    }

    /**
     * Opens a connection, runs one prepared statement and closes the
     * connection again. Values are always bound, never interpolated.
     *
     * @param string $sql    Statement with ? placeholders.
     * @param string $types  mysqli bind_param type string ('' when no parameters).
     * @param array  $params Values for the placeholders.
     * @return int|false The connection's insert id after the statement
     *                   (0 when the statement generated none), or false when
     *                   connecting, preparing or executing failed.
     */
    private function runStatement($sql, $types = '', array $params = array())
    {
        $db = null;
        try {
            $db = new mysqli($this->dbhost, $this->dbuser, $this->dbpassword, $this->dbname);
            if ($db->connect_errno) {
                return false;
            }

            $stmt = $db->prepare($sql);
            if ($stmt === false) {
                return false;
            }
            if ($types !== '') {
                $stmt->bind_param($types, ...$params);
            }

            $ok = $stmt->execute();
            $insertId = $db->insert_id;
            $stmt->close();

            return $ok ? $insertId : false;
        } catch (mysqli_sql_exception $e) {
            // mysqli throws by default since PHP 8.1; callers expect false.
            return false;
        } finally {
            if ($db instanceof mysqli && !$db->connect_errno) {
                $db->close();
            }
        }
    }
}
?>
|