# coding=utf-8
# 地区 (region): 1 = Beijing, 2 = Shanghai, 3 = Heilongjiang, 4 = Chongqing, 5 = Hainan
# 环境 (environment): 1 = related, 0 = unrelated
# 食品 (food): 1 = related, 0 = unrelated
# 情感 (sentiment): 1 = positive, -1 = negative, 0 = neutral
# uses webdriver from selenium; install with: pip install selenium
import re
import os
# yum install nodejs --> npm install -g cnpm --registry=https://registry.npm.taobao.org
# npm install phantomjs-prebuilt@2.1.14 --ignore-scripts -> pip install phantomjs
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
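# city_crawl(url, path) crawls one regional 163.com news portal and returns a
# DataFrame of [date, title, body] plus the label columns 地区/环境/食品/情感.
# Note that `path` is currently unused inside the function: the combined result
# of all five cities is written once at the bottom of the script.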
def city_crawl(url, path):
    # load PhantomJS driver
    cur_url = url
    # on Ubuntu, locate the binary with: which phantomjs
    driver = webdriver.PhantomJS('/usr/bin/phantomjs')
    # set the window size large enough to fit the whole page, so that
    # lazily loaded content is rendered up front
    driver.set_window_size(1280, 2400)  # optional
    # load the page
    driver.get(cur_url)
    # use page_source to get the rendered html content
    content = driver.page_source
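    # free the PhantomJS process now that the rendered HTML has been captured;
    # without this, each city_crawl call leaves a phantomjs process running
    driver.quit()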
    print content
    # driver.find_element_by_class_name()
    # data_time=re.findall('http:\/\/(.+\.){1,}163.com\/\d{2}\/\d{4}\/\d{2}\/.+\.html',content)
    # print data_time
    soup = BeautifulSoup(content, 'lxml')
    urls = []
    news_content = []
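    # the two CSS selectors below are tied to the 163.com regional portal layout at
    # crawl time: judging by the class names, 'div.aslide > a' picks the slider /
    # carousel headlines and 'div.na_detail > div.news_title > h3 > a' the regular
    # news-list headlines; both collect the link href and the headline text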
    for link in soup.select('div.aslide > a'):
        urls.append(link.get('href'))
        news_content.append(link.text.strip())
    print urls
    for i in news_content:
        print i
    print len(news_content)
    for link in soup.select('div.na_detail > div.news_title > h3 > a'):
        urls.append(link.get('href'))
        news_content.append(link.text)
    print urls
    for i in news_content:
        print i
    print len(news_content)
    # Part 3: read the content of each collected article page.
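    # read_page fetches each collected article URL with requests, selects the <p>
    # tags under div.post_text (the article body container on these pages), and
    # joins their text into one string per article appended to `content`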
    def read_page(urls, content):
        for url in urls:
            wb_data = requests.get(url)
            soup = BeautifulSoup(wb_data.text, 'lxml')
            cache = []
            for word in soup.select("div.post_text > p"):
                cache.append(word.text.strip())
            sentence = ''
            for i in cache:
                sentence = sentence + i
            # print sentence
            content.append(sentence)
        return content
    content = []
    contents = read_page(urls, content)
    print contents
    for i in contents:
        print i
    # Step 4: export to a text file.
    news_time = []
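    # 163.com article URLs have the form http://<host>/YY/MMDD/HH/<id>.html (see the
    # commented regex above), so after splitting on '/' the fields a[3] and a[4] are
    # the two-digit year and MMDD; prefixing '20' gives a YYYYMMDD date string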
    for i in urls:
        a = i.split('/')
        b = '20' + a[3] + a[4]
        # print a[2], b
        news_time.append(b)
    # a=np.array(urls).reshape(-1,1)
    # b=np.array(contents).reshape(-1,1)
    # print np.hstack((a,b))
    #
    # bj['topic']=urls
    # bj['content']=contents
    # pd.to_csv('./1.csv')
    a = np.array(news_time).reshape(-1, 1)
    b = np.array(news_content).reshape(-1, 1)
    c = np.array(contents).reshape(-1, 1)
    final0 = np.hstack((a, b, c))
    final1 = pd.DataFrame(final0)
    final1["地区"] = 1
    final1['环境'] = 0
    final1['食品'] = 0
    final1['情感'] = 0
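    # 地区 is hard-coded to 1 (Beijing) even though all five portals go through this
    # function; per the coding scheme at the top the other cities would be 2-5,
    # presumably corrected during later manual labelling. 环境/食品/情感 start at 0
    # as placeholders for manual annotation.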
    # Step 5: make sure the output is correct.
    return final1
beijing="http://bj.news.163.com/"
l1=city_crawl(beijing,"beijing.csv")
shanghai="http://sh.news.163.com/"
l2=city_crawl(shanghai,"shanghai.csv")
hainan="http://hn.news.163.com/"
l3=city_crawl(hainan,"hainan.csv")
heilongjiang="http://hlj.news.163.com/"
l4=city_crawl(heilongjiang,"heilongjiang.csv")
chongqing="http://chongqing.163.com/"
l5=city_crawl(chongqing,"chongqing.csv")
ll=np.concatenate([l1,l2,l3,l4,l5])
ll=pd.DataFrame(ll)
ll=ll.drop_duplicates()
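# np.concatenate returns a plain ndarray, so the column names set in city_crawl are
# lost here and the combined DataFrame falls back to integer column labels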
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
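# the reload/setdefaultencoding calls above are a Python 2 workaround so that the
# Chinese text can be written back to CSV without UnicodeEncodeError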
# ll: rows newly crawled from the web
# lll: rows already stored on disk
lll=pd.read_csv("./ll2.csv",encoding="utf-8")
# print ll.shape
# print final1.shape
lll=lll.iloc[:,1:]
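# iloc[:, 1:] drops the first column, i.e. the unnamed row index that to_csv wrote
# out on the previous run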
ll=np.concatenate([lll,ll],axis=0)
ll=pd.DataFrame(ll)
ll=ll.drop_duplicates()
# ll.to_csv("./%s"%(path))
ll.to_csv("./ll2.csv",encoding="utf-8")
print "wancheng"