<?php
/**
 * Shared helper class for data scraping: substring extraction, regex-based
 * harvesting of headings / links / images from HTML fragments, encoding
 * conversion, quote handling and HTML stripping.
 *
 * All HTML handling here is regex based and therefore best-effort; it is not
 * a substitute for a real HTML parser on malformed or adversarial markup.
 */
class grab_
{
    /**
     * Returns the text found between two delimiters.
     *
     * The text is taken between the first occurrence of $start and the next
     * occurrence of either $start or $end, whichever comes first. When $end
     * never appears, everything up to the next $start (or the end of the
     * input) is returned.
     *
     * @param string $str   Text to search in.
     * @param string $start Opening delimiter.
     * @param string $end   Closing delimiter.
     * @return string|null  The extracted text, '' when $start is not present,
     *                      or null when either delimiter is empty.
     */
    public function get_sub_content($str, $start, $end)
    {
        if ((string) $start === '' || (string) $end === '') {
            return;
        }

        $pieces = explode($start, (string) $str);
        if (!isset($pieces[1])) {
            // Start delimiter absent: nothing to extract.
            return '';
        }

        $inner = explode($end, $pieces[1]);

        return $inner[0];
    }

    /**
     * Collects the inner content of every <h2> element.
     *
     * @param string $edition HTML fragment.
     * @return array array('name' => list of heading contents)
     */
    public function edition($edition)
    {
        return $this->heading_contents($edition, 'h2');
    }

    /**
     * Collects the inner content of every <h3> element.
     *
     * @param string $edition HTML fragment.
     * @return array array('name' => list of heading contents)
     */
    public function editionh3($edition)
    {
        return $this->heading_contents($edition, 'h3');
    }

    /**
     * Collects link text and URL for anchors that are the sole, directly
     * wrapped child of an <li> (i.e. "<li><a href=...>text</a></li>").
     *
     * The href must be the first attribute of the anchor and the link text
     * must not contain nested tags.
     *
     * @param string $code HTML fragment.
     * @return array array('name' => list of link texts, 'url' => list of hrefs)
     */
    public function get_all_url($code)
    {
        $groups = $this->match_groups(
            '/<li><a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a><\/li>/i',
            $code,
            2
        );

        return array('name' => $groups[2], 'url' => $groups[1]);
    }

    /**
     * Collects link text and URL for every anchor whose first attribute is
     * href and whose link text contains no nested tags.
     *
     * @param string $code HTML fragment.
     * @return array array('name' => list of link texts, 'url' => list of hrefs)
     */
    public function get_all_urls($code)
    {
        $groups = $this->match_groups(
            '/<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)<\/a>/i',
            $code,
            2
        );

        return array('name' => $groups[2], 'url' => $groups[1]);
    }

    /**
     * Collects the double-quoted src value of <img> tags.
     *
     * Tags whose other attributes contain a "/" (apart from the self-closing
     * slash) are not matched by this pattern.
     *
     * @param string $code HTML fragment.
     * @return array array('imgsrc' => list of src values)
     */
    public function get_all_img($code)
    {
        $groups = $this->match_groups(
            '/<img([^\/>]+)src="([^"]+)"([^\/>]*)\/?>/i',
            $code,
            3
        );

        return array('imgsrc' => $groups[2]);
    }

    /**
     * Collects the double-quoted href value of <a ...> tags.
     *
     * @param string $code HTML fragment.
     * @return array array('href' => list of href values)
     */
    public function get_all_media($code)
    {
        // The previous pattern demanded a further '"' after the href
        // attribute, so a plain <a href="x"> was never matched.
        $groups = $this->match_groups('/<a[^>]*href="([^"]+)"/i', $code, 1);

        return array('href' => $groups[1]);
    }

    /**
     * Collects the value attribute of <param name="URL" value="..."> tags
     * (the media-file URL of embedded player objects).
     *
     * @param string $code HTML fragment.
     * @return array List of value strings.
     */
    public function get_mtwj_url($code)
    {
        $groups = $this->match_groups(
            '/<param[^>]*name="URL"\s+value="([^"]+)"([^\/>]*)\/?>/i',
            $code,
            2
        );

        return $groups[1];
    }

    /**
     * Collects every path-like token ending in ".htm" (letters, digits,
     * "/", "_" and "-" followed by ".htm"). A ".html" reference is reported
     * without its trailing "l".
     *
     * @param string $code Text or HTML fragment.
     * @return array array('href' => list of matched tokens)
     */
    public function get_all_mediahtm($code)
    {
        $groups = $this->match_groups('/[a-zA-Z0-9\/_-]+\.htm/', $code, 0);

        return array('href' => $groups[0]);
    }

    /**
     * Returns the lower-cased text after the last "." of a file name.
     *
     * @param string $filename File name or path.
     * @return string Extension without the dot, or '' when there is no dot.
     */
    public function file_ext($filename)
    {
        $tail = strrchr((string) $filename, '.');
        if ($tail === false) {
            return '';
        }

        return strtolower(trim(substr($tail, 1)));
    }

    /**
     * Creates a directory together with any missing parent directories.
     * Does nothing when the path already exists.
     *
     * @param string $path Directory to create.
     * @return void
     */
    public function create_folder($path)
    {
        if (!file_exists($path)) {
            // Recursive mkdir replaces the former call to an undefined
            // createFolder() helper. The effective mode is 0777 & ~umask.
            mkdir($path, 0777, true);
        }
    }

    /**
     * Converts a GBK/GB2312 encoded string, or every value of a (nested)
     * array of such strings, to UTF-8.
     *
     * @param string|array $string Value(s) to convert.
     * @return string|array Converted value(s), array keys preserved.
     */
    public function gbk_to_utf8($string)
    {
        if (!is_array($string)) {
            // mb_convert_encoding($value, $to, $from): the previous argument
            // order converted UTF-8 to GB2312, the opposite of what the
            // method name promises. GBK is a superset of GB2312.
            return mb_convert_encoding($string, 'UTF-8', 'GBK');
        }

        foreach ($string as $key => $val) {
            $string[$key] = $this->gbk_to_utf8($val);
        }

        return $string;
    }

    /**
     * Applies addslashes() to a string or recursively to every value of an
     * array, unless the global "magic_quotes_gpc" flag says the input was
     * already escaped.
     *
     * NOTE(review): addslashes() is not a safe way to build SQL; prefer
     * prepared statements with bound parameters at the call sites.
     *
     * @param string|array $string Value(s) to escape.
     * @param int|bool     $force  Escape even when $GLOBALS['magic_quotes_gpc'] is truthy.
     * @return string|array Escaped value(s).
     */
    public function sqladdslashes($string, $force = 0)
    {
        // empty() tolerates the flag being undefined, which is the normal
        // case on any PHP version without magic quotes.
        if (!empty($GLOBALS['magic_quotes_gpc']) && !$force) {
            return $string;
        }

        if (!is_array($string)) {
            return addslashes($string);
        }

        foreach ($string as $key => $val) {
            $string[$key] = $this->sqladdslashes($val, $force);
        }

        return $string;
    }

    /**
     * Replaces every ASCII single quote with the typographic right single
     * quotation mark.
     *
     * @param string $string Source text.
     * @return string Text with the quotes replaced.
     */
    public function strip_html_table($string)
    {
        return str_replace("'", '’', $string);
    }

    /**
     * Replaces every typographic right single quotation mark with an ASCII
     * single quote (the inverse of strip_html_table()).
     *
     * @param string $string Source text.
     * @return string Text with the quotes replaced.
     */
    public function strip_html_quote($string)
    {
        return str_replace('’', "'", $string);
    }

    /**
     * Removes <script>, <title>, <head> and <style> elements (tags and
     * content) from an HTML source, plus any orphaned opening or closing
     * tag of those four elements.
     *
     * @param string $string HTML source.
     * @return string Source without those elements; the input is returned
     *                unchanged if the regex engine reports an error.
     */
    public function strip_html_script($string)
    {
        $patterns = array(
            '~<script\b[^>]*>.*?</script>~is',
            '~<title\b[^>]*>.*?</title>~is',
            '~<head\b[^>]*>.*?</head>~is',
            '~<style\b[^>]*>.*?</style>~is',
            // Unbalanced leftovers of the same elements.
            '~</?(?:script|title|head|style)\b[^>]*>~i',
        );

        $cleaned = preg_replace($patterns, '', (string) $string);

        return $cleaned === null ? $string : $cleaned;
    }

    /**
     * Reduces HTML to plain text: collapses whitespace, drops script/style
     * blocks and tags, and decodes a handful of entities.
     *
     * Decoded entities are emitted as single bytes (chr()), i.e. Latin-1
     * style, exactly as before.
     *
     * NOTE(review): the final clean-up pass deletes everything from the
     * first remaining "/" or "&" to the end of the text (the text is a
     * single line at that point), so "Tom &amp; Jerry" becomes "Tom ".
     * This is long-standing behaviour and is kept as is; confirm whether
     * callers really want it.
     *
     * @param string $string HTML source.
     * @return string Plain text.
     */
    public function StripHTML($string)
    {
        $text = preg_replace('~\s{1,}~is', ' ', (string) $string);

        // Pass 1: tags and named/known numeric entities, applied in order.
        $patterns = array(
            '~<script[^>]*?>.*?</script>~si',
            '~<style[^>]*?>.*?</style>~si',
            '~<[\/\!]*?[^<>]*?>~si',
            '~([\r\n])[\s]+~',
            '~&(quot|#34);~i',
            '~&(amp|#38);~i',
            '~&(lt|#60);~i',
            '~&(gt|#62);~i',
            '~&(nbsp|#160);~i',
            '~&(iexcl|#161);~i',
            '~&(cent|#162);~i',
            '~&(pound|#163);~i',
            '~&(copy|#169);~i',
        );
        $replacements = array(
            '',
            '',
            '',
            '\\1',
            '',
            '&',
            '<',
            '>',
            ' ',
            chr(161),
            chr(162),
            chr(163),
            chr(169),
        );
        $text = preg_replace($patterns, $replacements, $text);

        // Pass 2: remaining numeric entities. The /e modifier that used to
        // evaluate "chr(\1)" no longer exists in PHP 7+, so use a callback.
        $text = preg_replace_callback(
            '~&#(\d+);~',
            function ($match) {
                return chr(((int) $match[1]) % 256);
            },
            $text
        );

        // Pass 3: destructive clean-up of anything that still looks like
        // markup, a path or an entity (see NOTE in the docblock).
        $leftovers = array(
            '~<.*?>~i',
            '~<~i',
            '~/.+~i',
            '~/~i',
            '~&.+~i',
            '~&~i',
        );

        return preg_replace($leftovers, '', $text);
    }

    /**
     * Returns the inner content of every <$tag ...>...</$tag> element.
     *
     * Matching is non-greedy so several headings on one line are reported
     * separately; content spanning multiple lines is not matched.
     *
     * @param string $html HTML fragment.
     * @param string $tag  Tag name; internal callers pass a literal such as "h2".
     * @return array array('name' => list of inner contents)
     */
    private function heading_contents($html, $tag)
    {
        $groups = $this->match_groups(
            '/<' . $tag . '\b[^>]*>(.*?)<\/' . $tag . '>/i',
            $html,
            1
        );

        return array('name' => $groups[1]);
    }

    /**
     * Runs preg_match_all() and guarantees that groups 0..$groupCount exist
     * in the result, even when the regex engine fails.
     *
     * @param string $pattern    PCRE pattern.
     * @param string $subject    Text to search.
     * @param int    $groupCount Highest capture-group index the caller reads.
     * @return array Matches in PREG_PATTERN_ORDER layout.
     */
    private function match_groups($pattern, $subject, $groupCount)
    {
        $found = array();
        if (preg_match_all($pattern, (string) $subject, $found) === false) {
            $found = array();
        }

        for ($i = 0; $i <= $groupCount; $i++) {
            if (!isset($found[$i])) {
                $found[$i] = array();
            }
        }

        return $found;
    }
}