"""
Created on Fri Jan 19 18:58:41 2018
人民网新闻爬虫
@author: gzs10227
"""
import sys
stderr = sys.stderr
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf8')
sys.stderr = stderr
sys.stdout = stdout
import urllib2,urllib
urllib.getproxies_registry = lambda: {}
import requests
from lxml import etree
import re,time,datetime
import os
base_path = u'C:/Users/gzs10227/Desktop/廖庆豪'
TYPE_DICT = {}
def open_url(url):
    """Fetch *url* and return the raw response body as a byte string.

    Sleeps 0.5 s first as a crude rate limit, and sends a desktop Chrome
    User-Agent so the site serves the regular page.
    """
    time.sleep(0.5)
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
    # timeout added so one stalled server cannot hang the whole crawl
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    # NOTE: .content is the undecoded byte string; the encoding set above
    # only affects .text, so callers handle UTF-8 decoding themselves.
    return response.content
# --- Discover the site's news sections from the front page ---------------
url = 'http://www.people.com.cn/'
html = open_url(url)
web_data = etree.HTML(html)
# Section links and their Chinese titles live in the "w1000" nav bar,
# e.g. href 'http://politics.people.com.cn/' with the section name as text.
links = web_data.xpath(r'//div[@class="w1000"]//span/a/@href')
types = web_data.xpath(r'//div[@class="w1000"]//span/a/text()')
for i in range(len(links)):
    link = links[i]
    # 'http://politics.people.com.cn/' -> 'politics' (sub-domain key)
    key = link.replace('http://','').replace('.people.com.cn/','')
    # One output folder per section, named after the section title.
    result_path = '%s/%s' % (base_path, types[i])
    if not os.path.exists(result_path):
        os.makedirs(result_path)
    TYPE_DICT[key] = types[i]
def get_newtype_href(link):
print link
html = open_url(link)
type_hrefs = re.findall(re.compile(r'href="(.*?)"'),html)
newtype = link.replace('http://','').replace('.people.com.cn/','')
type_hrefs = [i for i in type_hrefs if newtype in i and 'css' not in i and link != i]
index_hrefs = list(set([i for i in type_hrefs if i.endswith('index.html')]))
content_urls = list(set(type_hrefs) - set(index_hrefs))
for url in index_hrefs:
print url
html = open_url(link)
curls = re.findall(re.compile(r'href="(.*?)"'),html)
clear_url = []
for c in curls:
if 'n1' not in c or 'css' in c:
continue
else:
if c.startswith('/n1'):
c = link[:-1] + c
clear_url.append(c)
else:
clear_url.append(c)
content_urls.extend(clear_url)
content_urls = list(set(content_urls))
return content_urls
# Skip the personnel ('renshi') and aggregated 'news' sections, then
# gather every candidate article URL from the remaining ones.
links = [i for i in links if 'renshi' not in i and 'news' not in i]
result_link = map(get_newtype_href,links)
result_links = []
for i in result_link:
    for j in i:
        # Keep only absolute article pages (articles live under /n1/ and
        # end in .html); strip ASCII spaces, full-width spaces and tabs.
        if j.endswith('.html') and j.startswith('http') and 'n1' in j:
            result_links.append(j.replace(' ','').replace('　','').replace('\t',''))
result_links = list(set(result_links))
def get_content(href):
key = re.findall(re.compile(r'http://(.*?).people.com'),href)[0]
html = open_url(href)
web_data = etree.HTML(html)
print TYPE_DICT[key]
result_path = '%s/%s/' % (base_path, TYPE_DICT[key])
try:
title = web_data.xpath('//div[@class="clearfix w1000_320 text_title"]//h1/text()')[0]
content = web_data.xpath('//div[@class="box_con"]//p//text()')
contents = ''
for c in content:
contents = contents + c
except:
title = ''
contents = ''
print title
filename = str(int(time.time() * 1000)) + '.txt'
with open(result_path + filename,'w') as f:
f.writelines(title)
f.writelines(contents)
# Crawl and persist every collected article (Python 2 map runs eagerly).
map(get_content,result_links)
# 毕业设计之数据获取【人民网】 (graduation-project data collection — People's Daily Online)
# Blog-post metadata, not code: 最新推荐文章于 2024-12-07 03:53:56 发布