"""
Created on Fri Jan 19 18:58:41 2018
人民网新闻爬虫
@author: gzs10227
"""
import sys
stderr = sys.stderr
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf8')
sys.stderr = stderr
sys.stdout = stdout
import urllib2,urllib
urllib.getproxies_registry = lambda: {}
import requests
from lxml import etree
import re,time,datetime
import os
base_path = u'C:/Users/gzs10227/Desktop/廖庆豪'
TYPE_DICT = {}
def open_url(url):
    """Fetch *url* and return the raw response body as a byte string.

    Sleeps 0.5 s first as a crude rate limit, and sends a desktop Chrome
    User-Agent so the site serves the regular page.
    """
    time.sleep(0.5)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # timeout added so one stalled server cannot hang the whole crawl
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    # NOTE: .content is the undecoded byte string; the encoding set above
    # only affects .text, so callers handle UTF-8 decoding themselves.
    return response.content
# --- Discover the site's news sections from the front page ---------------
url = 'http://www.people.com.cn/'
html = open_url(url)
web_data = etree.HTML(html)
# Section links and their Chinese titles live in the "w1000" nav bar,
# e.g. href 'http://politics.people.com.cn/' with the section name as text.
links = web_data.xpath(r'//div[@class="w1000"]//span/a/@href')
types = web_data.xpath(r'//div[@class="w1000"]//span/a/text()')
for i in range(len(links)):
    link = links[i]
    # 'http://politics.people.com.cn/' -> 'politics' (sub-domain key)
    key = link.replace('http://','').replace('.people.com.cn/','')
    # One output folder per section, named after the section title.
    result_path = '%s/%s' % (base_path, types[i])
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    TYPE_DICT[key] = types[i]
def get_newtype_href(link):
print link
html = open_url(link)
type_hrefs = re.findall(re.compile(r'href="(.*?)"'),html)
newtype = link.replace('http://','').replace('.people.com.cn/','')
type_hrefs = [i for i in type_hrefs if newtype in i and 'css' not in i and link != i]
index_hrefs = list(set([i for i in type_hrefs if i.endswith('index.html')]))
content_urls = list(set(type_hrefs) - set(index_hrefs))
for url in index_hrefs:
print url
html = open_url(link)
curls = re.findall(re.compile(r'href="(.*?)"'),html)
clear_url = []
for c in curls:
if 'n1' not in c or 'css' in c:
continue
else:
if c.startswith('/n1'):
c = link[:-1] + c
clear_url.append(c)
else:
clear_url.append(c)
content_urls.extend(clear_url)
content_urls = list(set(content_urls))
return content_urls
# Skip the personnel ('renshi') and aggregated 'news' sections, then
# gather every candidate article URL from the remaining ones.
links = [i for i in links if 'renshi' not in i and 'news' not in i]
result_link = map(get_newtype_href,links)
result_links = []
for i in result_link:
    for j in i:
        # Keep only absolute article pages (articles live under /n1/ and
        # end in .html); strip ASCII spaces, full-width spaces and tabs.
        if j.endswith('.html') and j.startswith('http') and 'n1' in j:
            result_links.append(j.replace(' ','').replace('　','').replace('\t',''))
result_links = list(set(result_links))
def get_content(href):
key = re.findall(re.compile(r'http://(.*?).people.com'),href)[0]
html = open_url(href)
web_data = etree.HTML(html)
print TYPE_DICT[key]
result_path = '%s/%s/' % (base_path, TYPE_DICT[key])
try:
title = web_data.xpath('//div[@class="clearfix w1000_320 text_title"]//h1/text()')[0]
content = web_data.xpath('//div[@class="box_con"]//p//text()')
contents = ''
for c in content:
contents = contents + c
except:
title = ''
contents = ''
print title
filename = str(int(time.time() * 1000)) + '.txt'
with open(result_path + filename,'w') as f:
f.writelines(title)
f.writelines(contents)
# Crawl and persist every collected article (Python 2 map runs eagerly).
map(get_content,result_links)
# 毕业设计之数据获取【人民网】 (graduation-project data collection — People's Daily Online)
# Blog-post metadata, not code: 最新推荐文章于 2024-12-07 03:53:56 发布