/*
 * Author's note: I have been doing web data scraping lately and, frankly, it
 * is tedious work. Below are a few functions that are indispensable for
 * scraping. I am posting them in the hope that they help whoever needs them;
 * later I will tidy them up into a class and upload that properly, for
 * reference.
 */

// Function 1

/**
 * Returns the text found between two marker strings.
 *
 * The input is split on every occurrence of $start; the piece that follows the
 * first occurrence is then cut at the first occurrence of $end. If $end does
 * not occur in that piece, the whole piece is returned.
 *
 * @param string $str   The text to search (e.g. page source).
 * @param string $start Marker that precedes the wanted content.
 * @param string $end   Marker that follows the wanted content.
 *
 * @return string|null The content between $start and $end; '' when $start does
 *                     not occur in $str; null when either marker is empty.
 */
function get_sub_content($str, $start, $end)
{
    // explode() rejects an empty delimiter, so bail out early.
    if ($start == '' || $end == '') {
        return null;
    }

    $pieces = explode($start, $str);
    if (!isset($pieces[1])) {
        // $start never occurs: nothing to extract (avoids an undefined-offset
        // notice and passing null to explode()).
        return '';
    }

    $tail = explode($end, $pieces[1]);

    return $tail[0];
}

// Function 2

/**
 * Removes <script>, <style>, <head> and <title> elements, including their
 * contents, from an HTML string.
 *
 * Matching is case-insensitive and non-greedy, so only the elements
 * themselves are removed and the surrounding markup is left untouched.
 *
 * @param string $string The HTML source to clean.
 *
 * @return string The cleaned HTML; the input is returned unchanged if the
 *                regex engine reports an error.
 */
function strip_html_script($string)
{
    $patterns = array(
        '#<script(?:\s[^>]*)?>.*?</script\s*>#is',
        '#<style(?:\s[^>]*)?>.*?</style\s*>#is',
        '#<head(?:\s[^>]*)?>.*?</head\s*>#is',
        // A <title> inside <head> is already gone; this catches stray ones.
        '#<title(?:\s[^>]*)?>.*?</title\s*>#is',
    );

    $cleaned = preg_replace($patterns, '', $string);

    // preg_replace() yields null on a PCRE error (e.g. backtrack limit hit).
    return $cleaned === null ? $string : $cleaned;
}

// Function 3

/**
 * Creates a directory together with any missing parent directories.
 *
 * Does nothing when $path is empty or when something already exists at $path.
 *
 * @param string $path Directory path to create.
 *
 * @return void
 */
function create_folder($path)
{
    if ($path == '' || file_exists($path)) {
        return;
    }

    // The third argument makes mkdir() create the missing parents as well.
    mkdir($path, 0777, true);
}

// Function 4

/**
 * Collects the link targets and link texts of the anchors in an HTML fragment.
 *
 * Only anchors whose first attribute is href are matched, and the link text
 * must not contain a '>' character (so anchors wrapping other tags are
 * skipped).
 *
 * @param string $code The HTML fragment to scan.
 *
 * @return array Map with two parallel lists: 'name' holds the link texts and
 *               'url' holds the corresponding href values.
 */
function get_all_urls($code)
{
    $found = preg_match_all(
        '#<a\s+href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)</a>#i',
        $code,
        $matches
    );

    if ($found === false || !isset($matches[1], $matches[2])) {
        return array('name' => array(), 'url' => array());
    }

    return array('name' => $matches[2], 'url' => $matches[1]);
}