微信小说的爬取

需求:输入小说名字,然后到微信端的小说站点搜索该小说,爬取它的基本信息和全部章节内容,保存到自己的数据库中

第一步:需要制作一个表单提交小说名字

第二步:后台将提交过来的小说名字接收过来拼接url地址,然后再获取小说的内容,通过正则匹配或者字符串截取就可以拿到想要的东西了,最后插入数据库

个人技术难点:取得数据之后,要求50条一批插入数据库,结果我不会,值得反思

解决办法:在循环外先声明一个 $data 数组;循环中为每一章声明一个临时数组,用它保存数据表中各字段的值,再把临时数组追加到 $data[] 中,同时用一个索引(计数器)记录已累积的条数;当累积满50条时,用 addAll($data) 批量插入数据库并清空 $data;循环结束后,再把剩余不足50条的 $data 插入一次

具体代码:


/**
 * Build spoofed request headers for the crawler.
 *
 * Each call picks a random last octet for the CLIENT-IP and X-FORWARDED-FOR
 * header values, so consecutive requests carry different addresses.
 *
 * @return array List of three raw header lines: CLIENT-IP, X-FORWARDED-FOR and Accept.
 */
function fakeIp(){
   // Fixed /24 prefixes; only the last octet (0-254) is randomised.
   $cip = '14.111.58.'.mt_rand(0,254);
   $xip = '125.90.88.'.mt_rand(0,254);

   return array(
         'CLIENT-IP:'.$cip,
         'X-FORWARDED-FOR:'.$xip,
         // The Accept value previously had stray text pasted in after
         // "application/xml", which made the media-range list invalid.
         'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
         );
}

/**
 * Fetch a URL with cURL, sending a fixed Referer and a WeChat-browser User-Agent.
 *
 * @param string      $url      Address to request.
 * @param array|null  $header   Optional list of raw header lines (CURLOPT_HTTPHEADER).
 * @param string|null $cookie   Optional cookie string (CURLOPT_COOKIE).
 * @param int         $is_https When truthy, TLS peer and host verification are switched off.
 * @return string|false Response body, or false when the handle cannot be
 *                      created or the transfer fails.
 */
function getUrlContent($url,$header = null,$cookie = null, $is_https = 0){
   $ch = curl_init();
   if($ch === false){
      // No handle: report failure the same way curl_exec() does.
      return false;
   }

   curl_setopt($ch, CURLOPT_URL, (string)$url);
   if(!empty($header)){
      curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
   }
   curl_setopt($ch, CURLOPT_REFERER, "zsxs.shu22.cn");
   // Return the body from curl_exec() instead of echoing it.
   curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
   curl_setopt($ch, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.493.400 QQBrowser/9.0.2524.400");
   // Advertise compressed transfer; cURL decodes the response transparently.
   curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate");

   if($is_https){
      // WARNING: this disables certificate and host-name checks, so the
      // connection is open to man-in-the-middle attacks.
      curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
      curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
   }

   if(!empty($cookie)){
      curl_setopt($ch, CURLOPT_COOKIE, $cookie);
   }

   $contents = curl_exec($ch);
   curl_close($ch);
   return $contents;
}


/**
 * Crawl one novel from liweih5.ikanshu.cn and store it in the local database.
 *
 * Reads from $_POST:
 *   - kw:             novel name to search for
 *   - class_id:       primary category id
 *   - other_class_id: secondary category id
 *
 * Flow: search for the book, fetch its detail page, insert the base record via
 * M('Novel'), download every chapter linked from the catalog page, insert the
 * chapters into novel_chapter_add in batches of at most 50 rows, then copy
 * them into novel_chapter and clear the staging rows.
 *
 * Outcomes are reported through $this->error() / $this->success().
 * NOTE(review): $this->error() presumably ends the request (framework
 * controller behaviour); an explicit return follows each call so the method
 * stops either way.
 */
public function getquanwen1(){
    // Crawling a whole book can take a long time; lift the execution limit.
    set_time_limit(0);

    $keywords = (isset($_POST['kw']) && is_string($_POST['kw'])) ? $_POST['kw'] : '';

    // Cast the ids to int: they are concatenated into WHERE clauses below,
    // so raw request values must never reach the SQL string.
    $class_id = isset($_POST['class_id']) ? (int)$_POST['class_id'] : 0;
    $other_class_id = isset($_POST['other_class_id']) ? (int)$_POST['other_class_id'] : 0;
    if (empty($class_id) || empty($other_class_id)) {
        $this->error('请选择小说分类');
        return;
    }

    // Resolve both category names.
    $class_model = M("NovelClass");
    $class_name_arr = $class_model->where('class_id='.$class_id)->select();
    $class_name = isset($class_name_arr[0]['class_name']) ? $class_name_arr[0]['class_name'] : null;
    $other_class_arr = $class_model->where('class_id='.$other_class_id)->select();
    $other_class_name = isset($other_class_arr[0]['class_name']) ? $other_class_arr[0]['class_name'] : null;

    // Returns the text between the first $open marker and the next $close
    // marker after it, or null when either marker is missing.
    $between = function ($html, $open, $close) {
        $from = strpos($html, $open);
        if ($from === false) {
            return null;
        }
        $from += strlen($open);
        $to = strpos($html, $close, $from);
        if ($to === false) {
            return null;
        }
        return substr($html, $from, $to - $from);
    };

    // Search request; the keyword is user input and must be URL-encoded.
    $url = 'http://liweih5.ikanshu.cn/searchbook.aspx?key='.urlencode($keywords).'&type=1';
    $header = fakeIp();
    // NOTE(review): hard-coded session cookie of a real account; it will
    // expire and should not live in source control.
    $cookie = "ikanshuuser=userid=141760945&username=170619092244708337966005&channelid=3725&sessionid=hxkb4mcm4tdpopcvdpi41v4g&checkcode=27ad789c2e14a1286c1b28282876e5aa&version=1;
                    Hm_lpvt_d3a69bde0c513dc29f82d2f573aaceff=1497836439;
                    Hm_lvt_d3a69bde0c513dc29f82d2f573aaceff=1497835352;
                    ASP.NET_SessionId=hzppzb5m0ejj01dumpeqdvt2;
                    ikscnid=rndid=170619092308513978&channelid=3603; ";

    // A failed fetch returns false; treat it as an empty page.
    $content = (string)getUrlContent($url, $header, $cookie, 1);

    // Book id comes from the first hit: <a href="/book/{id}.html" class="zw_box...
    $book_id = $between($content, '<a href="/book/', '.html" class="zw_box');
    if ($book_id === null) {
        $this->error('您搜索的小说不存在');
        return;
    }

    // Detail page of the book, and the catalog URL stored as web_from.
    $book_url = 'http://liweih5.ikanshu.cn/book/'.$book_id.'.html';
    $web_from = 'http://liweih5.ikanshu.cn/bookcatalog/'.$book_id.'.html';
    $book_content = (string)getUrlContent($book_url, $header, $cookie, 1);

    // The title on the detail page must match the searched name exactly.
    $novel_name = $between($book_content, '<h3>', '</h3>');
    if ($novel_name === null || $keywords !== $novel_name) {
        $this->error('您搜索的小说不存在');
        return;
    }

    // Reject duplicates before downloading the cover or inserting anything.
    $novel_model = M('Novel');
    if ($novel_model->getByName($keywords) != null ) {
        $this->error('您搜索的小说已经存在');
        return;
    }

    // Author: second capture group is the link text.
    preg_match('|<a href="\/authorinfo.aspx(.*?)>(.*?)<\/a>|i',$book_content,$novel_author);

    // Word count: the label is searched without its trailing colon, but the
    // colon is included in the length that is skipped.
    $zishu_label = strpos($book_content,'<p>字数');
    $zishu_end = strpos($book_content,'&nbsp;万字');
    if ($zishu_label === false || $zishu_end === false) {
        $novel_zishu = '';
    } else {
        $zishu_start = $zishu_label + strlen('<p>字数:');
        $novel_zishu = substr($book_content,$zishu_start,$zishu_end-$zishu_start);
    }

    // Completion flag: 1 when the status text right after the label is '完本'.
    $is_end = 0;
    $status_label = strpos($book_content,'<p>状态');
    if ($status_label !== false) {
        $status_text = substr($book_content,$status_label + strlen('<p>状态:'),strlen('完本'));
        if ($status_text === '完本') {
            $is_end = 1;
        }
    }

    // Cover image. getWebImg() is defined elsewhere; this code only relies on
    // the "save_path" and "file_name" keys of its result.
    $novel_base = array();
    $cover_path = '';
    if (preg_match('|<img src="(.*?)\.jpg|i',$book_content,$novel_img)) {
        $img = getWebImg($novel_img[1].'.jpg');
        $cover_path = $img["save_path"].$img["file_name"];
    }
    $novel_base["img"] = $cover_path;
    $novel_base["mini_img"] = $cover_path;

    // Synopsis; falls back to a single space when the block is missing.
    $novel_description = $between($book_content,'<div class="subject-intro" id="uiMoreIntro">','</div>');

    // Base record for the novel.
    $novel_base['name'] = $keywords;
    $novel_base['description'] = ($novel_description !== null) ? $novel_description : ' ';
    $novel_base['class_id'] = $class_id;
    $novel_base['class_name'] = $class_name;
    $novel_base['other_class_name'] = $other_class_name;
    $novel_base['other_class_id'] = $other_class_id;
    $novel_base['publish_time'] = date("Y-m-d H:i:s");
    $novel_base['word_count'] = $novel_zishu.'万字';
    $novel_base['is_end'] = $is_end;
    $novel_base['writer'] = isset($novel_author[2]) ? $novel_author[2] : '';
    $novel_base['from'] = '全本小说';
    $novel_base['web_from'] = $web_from;
    $novel_base['is_shelf'] = 0;

    // Insert the novel and keep its new id for the chapter rows.
    $novel_id = $novel_model->data($novel_base)->add();
    if (empty($novel_id)) {
        // Without an id the WHERE clauses below would be malformed.
        $this->error('小说基本信息保存失败');
        return;
    }

    // Fetch the catalog page and collect every chapter link on it.
    preg_match('|<a href="\/bookcatalog\/(.*?).html">|i',$book_content,$catelog);
    $catalog_url = 'http://liweih5.ikanshu.cn/bookcatalog/'.(isset($catelog[1]) ? $catelog[1] : '').'.html';
    $catelog_content = (string)getUrlContent($catalog_url, $header, $cookie, 1);
    preg_match_all('/\/book\/\d+\/\d+\.html/',$catelog_content,$lianjia);

    // Chapters are buffered and written in batches of at most $batch_size rows.
    $batch_size = 50;
    $data = array();

    foreach($lianjia[0] as $item){
        // Flush a full batch before fetching the next chapter.
        if(count($data) >= $batch_size){
            D('novel_chapter_add')->addAll($data);
            $data = array();
        }

        // $item looks like /book/{bookId}/{chapterNo}.html; group 3 is the chapter number.
        preg_match('|\/(.*?)\/(.*?)\/(.*?)\.|i',$item,$zhangjieshu);
        $chapter_no = isset($zhangjieshu[3]) ? ''.$zhangjieshu[3].'' : '';

        $novel_address = 'http://liweih5.ikanshu.cn'.$item;
        $chapter_content = (string)getUrlContent($novel_address,$header,$cookie,1);

        // The site answers with a top-up notice once the account balance is
        // used up: roll back everything stored for this novel and stop.
        if(strpos($chapter_content,'您的账户余额不足,请进入充值或选择下面的快捷充值') !== false) {
            $novel_model->where('id='.$novel_id)->delete();
            D('novel_chapter_add')->where('novel_id='.$novel_id)->delete();
            $this->error('您的余额不足了,请及时充值');
            return;
        }

        // Chapter heading.
        // NOTE(review): the original tested strpos($heading, '') before
        // splitting on ' ' and taking the second part; an empty needle can
        // never be truthy, so the whole heading was always used. The lost
        // delimiter was presumably a space character - confirm before
        // reinstating the split.
        $title = $between($chapter_content,'<div class="r-chaptername" id="lbChapterName">','</div>');
        if ($title === null) {
            $title = '';
        }

        // Headings containing "hapter" after their first byte (e.g.
        // "Chapter 12") are blanked to a single space.
        if(strpos($title,'hapter')){
            $title = ' ';
        }

        // Chapter body: <br/> separators become paragraph boundaries.
        $chapter_body = $between($chapter_content,'<div class="r-content" id="uiContent">','</div>');
        if ($chapter_body === null) {
            $chapter_body = '';
        }
        $novel_content = '<p>'.str_replace('<br/>','</p><p>',$chapter_body);

        // One row per chapter, appended to the pending batch.
        $data[] = array(
            'title'          => $title,
            'chapter'        => $chapter_no,
            'content'        => $novel_content,
            'order_set'      => $chapter_no,
            'add_time'       => date("Y-m-d H:i:s"),
            'novel_name'     => $keywords,
            'novel_id'       => $novel_id,
            'novel_mini_img' => $novel_base["img"],
        );
    }

    // Write whatever is left in the last, partially filled batch.
    if(!empty($data)){
        D("novel_chapter_add")->addAll($data);
    }

    // Copy the staged chapters from novel_chapter_add into novel_chapter.
    $cong_data = D('novel_chapter_add')
        ->field('chapter,title,content,novel_name,novel_id,novel_mini_img,order_set,add_time')
        ->where('novel_id='.$novel_id)
        ->select();
    $zhu_data = D('novel_chapter')->addAll($cong_data);

    if(empty($zhu_data)){
        $this->error('同步失败');
        return;
    }

    // Staging rows are no longer needed once the copy succeeded.
    D('novel_chapter_add')->where('novel_id='.$novel_id)->delete();
    $this->success('恭喜您,已经抓取完毕');
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值