抓取微信公众号文章及本地化

最新推荐文章于 2025-06-19 11:15:31 发布

a329958_1

最新推荐文章于 2025-06-19 11:15:31 发布

阅读量6.9k

点赞数

CC 4.0 BY-SA版权

分类专栏： php 文章标签： php 微信

本文链接：https://blog.youkuaiyun.com/a329958_1/article/details/77839902

php 专栏收录该内容

4 篇文章

订阅专栏

最近单位需要我做个可以抓取微信公众号文章的功能。说实话，本人只是一个菜鸟，要完成这样的工作真的太难了。本来想着写个爬虫就好了，但是微信公众号文章通常要到搜狗微信搜索上去抓取，结果它的反爬虫机制让我这个菜鸟无从下手。后来在“伟大”的百度帮助下，我找到了现成的抓取爬虫，不过是用 Python 写的。解决了抓取问题，接下来就是本地化的问题。我用了 PHP 的 file_get_contents() 函数，把临时链接对应的文章内容全部读取为字符串，处理后保存成本地静态 HTML 文件，并把文件路径存入数据库，这样就不用担心链接过期的问题了。
接下来上代码，抓取的爬虫链接在此

http://blog.youkuaiyun.com/niuxiaojia09/article/details/55260770

Controller层

 //根据文章临时地址将文章本地化存入数据库
    /**
     * Localises every crawled article queued in the `wenzhang_info` table.
     *
     * For each queued row the page behind `content_url` is downloaded, its
     * markup is rewritten (trap comment removed, empty paragraphs collapsed,
     * lazy-loaded images and videos made displayable, scripts stripped), the
     * result is written to `html/<Ymd>/<d>/<_id>.html`, and a
     * `wenzhang_bendi_static_` record pointing at that file is inserted.
     *
     * A queue row is deleted only after its article has been stored, so a
     * failed download/write (or a row added by the crawler while this action
     * is running) stays in the queue and is picked up by the next run.
     *
     * @return mixed the rendered `image` view
     */
    public function actionGetimg()
    {
        $db = \Yii::$app->db;
        $rows = $db->createCommand("SELECT * FROM `wenzhang_info` ;")->queryAll();

        // Proxy ("springboard") the WeChat images are routed through.
        $imageProxy = 'http://pic.visionbang.com/?url=';

        // Regex rewrite stages, applied in order: array(pattern(s), replacement(s)).
        $rewriteSteps = array(
            // Collapse runs of <br> and drop empty <p> / <p><span> paragraphs.
            array(
                array(
                    '/\s*(<br\s*\/?\s*>\s*){2,}/im',
                    '/(<p>(\s|\s*<br\s*\/?\s*>\s*)*<\/p>)+/im',
                    '/(<p>(\s|\s*<span\s*\/?\s*>\s*)*<\/p>)+/im',
                ),
                // The third replacement used to be implicit (a missing entry means '').
                array('\\1\\1', '', ''),
            ),
            // Turn the lazy-load data-src of images into a real src behind the proxy.
            array('/<img.+?data-src=\"(.+?)\"/i', '<img src="' . $imageProxy . '\1"  '),
            // Strip JS so that it cannot break the embedded videos.
            array('/<(script.*?)>(.*?)<(\/script.*?)>/si', " "),
            // Point video iframes at the embeddable player.
            // NOTE(review): the replacement ends with a stray `" ` after the tag;
            // kept as-is to preserve the generated markup — confirm whether it is intended.
            array(
                '/<iframe.*?data-src=".+?vid=(.+?)&(.+?)&(.+?)&(.+?)".*?>/',
                '<iframe src="https://v.qq.com/iframe/player.html?vid=\1&\4" style="width:100%;height:auto;display:block;overflow:hidden;"scrolling="no"frameborder="0">" ',
            ),
        );

        foreach ($rows as $row) {
            $timestamp = strtotime($row['date_time']);
            $directory = 'html/' . date('Ymd', $timestamp) . '/' . date('d', $timestamp);
            $path = $directory . '/' . $row['_id'] . '.html';

            $failure = null;
            try {
                // Fetch the page source behind the temporary link.
                $html = file_get_contents($row['content_url']);
                if ($html === false) {
                    throw new \RuntimeException('download failed');
                }

                // Remove the head-trap comment WeChat puts into the page.
                $html = str_replace("<!--headTrap<body></body><head></head><html></html>-->", "", $html);

                foreach ($rewriteSteps as $step) {
                    $html = preg_replace($step[0], $step[1], $html);
                    if ($html === null) {
                        // preg_replace() returns null on a PCRE error (e.g. backtrack limit).
                        throw new \RuntimeException('regex rewrite failed, PCRE error ' . preg_last_error());
                    }
                }

                // Create the whole directory chain at once; the second is_dir()
                // covers a concurrent request creating it in between.
                if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
                    throw new \RuntimeException('cannot create directory ' . $directory);
                }

                // Write the static file.
                if (file_put_contents($path, $html) === false) {
                    throw new \RuntimeException('cannot write ' . $path);
                }
            } catch (\ErrorException $e) {
                // PHP warnings that the error handler converted into exceptions.
                $failure = $e;
            } catch (\RuntimeException $e) {
                $failure = $e;
            }

            if ($failure !== null) {
                \Yii::warning('Article ' . $row['_id'] . ' not localised: ' . $failure->getMessage(), __METHOD__);
                continue;
            }

            $model = new wenzhang_bendi_static_();
            $model->resetPartitionIndex($row['_id']);
            $model->id = $row['_id'];
            $model->title = html_entity_decode($row['title']);
            $model->mpid = $row['mp_id'];
            $model->cover_url = $row['cover_url'];
            $model->date_time = $row['date_time'];
            $model->date_unix = $timestamp;
            $model->msg_index = $row['msg_index'];
            // Only the path of the static file is stored, not the markup itself.
            $model->article_info = $path;

            if ($model->insert() === false) {
                \Yii::warning('Article ' . $row['_id'] . ' could not be saved', __METHOD__);
                continue;
            }

            // Remove just this row from the crawl queue now that it is stored.
            // NOTE(review): assumes `_id` identifies a single queue row — confirm against the schema.
            $db->createCommand(
                'DELETE FROM `wenzhang_info` WHERE `_id` = :id',
                array(':id' => $row['_id'])
            )->execute();
        }

        return $this->render('image');
    }
}