Beijing news crawler - final


# coding=utf-8
# 地区 (region):      1 = Beijing, 2 = Shanghai, 3 = Heilongjiang, 4 = Chongqing, 5 = Hainan
# 环境 (environment): 1 = related, 0 = not related
# 食品 (food):        1 = related, 0 = not related
# 情感 (sentiment):   1 = positive, -1 = negative, 0 = neutral
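# For reference, the same coding scheme written out as Python dicts (illustrative
# names added here; nothing below depends on them):
REGION_CODES = {1: 'Beijing', 2: 'Shanghai', 3: 'Heilongjiang', 4: 'Chongqing', 5: 'Hainan'}
RELEVANCE_CODES = {1: 'related', 0: 'unrelated'}                 # environment / food columns
SENTIMENT_CODES = {1: 'positive', 0: 'neutral', -1: 'negative'}
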
# uses webdriver from selenium
# install it first: pip install selenium
import re
import os
# yum install nodejs   -->  npm install -g cnpm --registry=https://registry.npm.taobao.org
# npm install phantomjs-prebuilt@2.1.14 --ignore-scripts -> pip install phantomjs
from selenium import webdriver
import numpy as np
import pandas as pd
def city_crawl(url, path, region=1):
        # url:    front page of the 163.com city channel to crawl
        # path:   per-city CSV name (the per-city export is commented out at the bottom)
        # region: numeric region code from the header comments (1 = Beijing, ..., 5 = Hainan)
        # load PhantomJS driver
        cur_url =url
        # on Ubuntu, find the binary path with: which phantomjs
        driver = webdriver.PhantomJS('/usr/bin/phantomjs')
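        # PhantomJS is no longer maintained; if it is unavailable, a headless Chrome
        # driver is one possible substitute (a sketch only, assuming chromedriver is
        # installed and on PATH; not what this script uses):
        # from selenium.webdriver.chrome.options import Options
        # chrome_opts = Options()
        # chrome_opts.add_argument('--headless')
        # driver = webdriver.Chrome(chrome_options=chrome_opts)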
        # set window size, better to fit the whole page in order to
        # avoid dynamically loading data
        driver.set_window_size(1280, 2400)  # optional
        # data page content
        driver.get(cur_url)
        # use page_source to get html content
        content = driver.page_source
        print content

        # driver.find_element_by_class_name()

        # data_time=re.findall('http:\/\/(.+\.){1,}163.com\/\d{2}\/\d{4}\/\d{2}\/.+\.html',content)
        # print data_time
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(content, 'lxml')
        urls = []
        news_content = []

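        # first selector: collect the href and link text of every <a> directly under div.aslide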
        for link in soup.select('div.aslide > a'):
            urls.append(link.get('href'))
            news_content.append(link.text.strip())
        print urls

        for i in news_content:
            print i
        print len(news_content)

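        # second selector: links and titles from the regular news list (div.na_detail > div.news_title > h3 > a)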
        for link in soup.select('div.na_detail  > div.news_title > h3 > a'):
            urls.append(link.get('href'))
            news_content.append(link.text)
        print urls
        for i in news_content:
            print i
        print len(news_content)

        # Step 3: fetch the full text of each collected URL.
        from bs4 import BeautifulSoup
        import requests
        import time


        def read_page(urls, content):
            for url in urls:
                wb_data = requests.get(url)
                soup = BeautifulSoup(wb_data.text, 'lxml')
                cache = []

                for word in soup.select("div.post_text > p "):
                    cache.append(word.text.strip())
                sentence = ''
                for i in cache:
                    sentence = sentence + i
                    # print sentence
                content.append(sentence)
            return content
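        # read_page assumes every request succeeds; a more defensive variant would add a
        # timeout and skip pages that fail (a sketch, not part of the original flow):
        #     try:
        #         wb_data = requests.get(url, timeout=10)
        #         wb_data.raise_for_status()
        #     except requests.exceptions.RequestException:
        #         content.append('')
        #         continue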


        content = []
        contents = read_page(urls, content)
        print contents
        for i in contents:
            print i

        # Step 4: assemble date, title and content so they can be exported to a file.
        import pandas as pd
        import numpy as np

        news_time = []
        for i in urls:
            a = i.split('/')
            b = '20' + a[3] + a[4]
            # print a[2], b
            news_time.append(b)
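        # Example of the date extraction above: for a (hypothetical) article URL such as
        # http://bj.news.163.com/17/0506/10/XXXXXXXX.html, split('/') gives
        # ['http:', '', 'bj.news.163.com', '17', '0506', '10', 'XXXXXXXX.html'],
        # so b = '20' + '17' + '0506' = '20170506'.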
        # a=np.array(urls).reshape(-1,1)
        # b=np.array(contents).reshape(-1,1)
        # print np.hstack((a,b))
        #
        # bj['topic']=urls
        # bj['content']=contents
        # pd.to_csv('./1.csv')
        a = np.array(news_time).reshape(-1, 1)
        b = np.array(news_content).reshape(-1, 1)
        c = np.array(contents).reshape(-1, 1)

        final0 = np.hstack((a, b, c))
        final1 = pd.DataFrame(final0)
        final1["地区"] = 1
        final1['环境'] = 0
        final1['食品'] = 0
        final1['情感'] = 0

        # Step 5: return the result so the caller can check that the output is correct.
        return final1



beijing = "http://bj.news.163.com/"
l1 = city_crawl(beijing, "beijing.csv", region=1)
shanghai = "http://sh.news.163.com/"
l2 = city_crawl(shanghai, "shanghai.csv", region=2)
hainan = "http://hn.news.163.com/"
l3 = city_crawl(hainan, "hainan.csv", region=5)
heilongjiang = "http://hlj.news.163.com/"
l4 = city_crawl(heilongjiang, "heilongjiang.csv", region=3)
chongqing = "http://chongqing.163.com/"
l5 = city_crawl(chongqing, "chongqing.csv", region=4)
ll=np.concatenate([l1,l2,l3,l4,l5])
ll=pd.DataFrame(ll)
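# np.concatenate discards the DataFrame column labels, which is why ll ends up with
# numeric column names. A variant that keeps the labels would use pd.concat instead
# (a sketch, not what this script does):
#     ll = pd.concat([l1, l2, l3, l4, l5], ignore_index=True)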

ll=ll.drop_duplicates()
import sys

reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 workaround so the Chinese text can be written out without encode errors
# ll:  data crawled from the web in this run
# lll: data previously saved to ll2.csv (the file must already exist)
lll=pd.read_csv("./ll2.csv",encoding="utf-8")
# print ll.shape
# print final1.shape
lll = lll.iloc[:, 1:]  # drop the index column that to_csv wrote out
ll=np.concatenate([lll,ll],axis=0)
ll=pd.DataFrame(ll)
ll=ll.drop_duplicates()


# ll.to_csv("./%s"%(path))
ll.to_csv("./ll2.csv",encoding="utf-8")
print "wancheng"

