[Python Is Really Powerful] Crawling Baidu Tieba's Shanghai Bar with Scrapy

This article describes how to use the Scrapy framework to crawl the topic posts and comments of a specific Baidu Tieba forum (using the Shanghai bar as an example) and store the data in a MySQL database. By defining a spider class, configuring global settings, and parsing the page data, it crawls the forum content within a given time window.

The requirement: fetch the topic posts from the last 20 days together with their direct replies (replies to replies are filtered out), and write the data to MySQL. Baidu Tieba's Shanghai bar is used as the example.

The Shanghai bar is structured as shown below; both the topic posts and the replies are paginated:

 

[Screenshot: subject (topic post) listing with pagination]

[Screenshot: post (reply) listing with pagination]

Defining the global variables (settings.py):

# -*- coding: utf-8 -*-

# Scrapy settings for tieba project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'tieba'

SPIDER_MODULES = ['tieba.spiders']
NEWSPIDER_MODULE = 'tieba.spiders'

# Start URL of the Shanghai bar (kw is the URL-encoded bar name, 上海)
START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7'
# Alternative example: the Shanghai Jiao Tong University bar
#START_URL = 'http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7%E4%BA%A4%E9%80%9A%E5%A4%A7%E5%AD%A6'
TOTAL_DAYS = "20"

ITEM_PIPELINES = ['tieba.pipelines.MySQLDBPipeline']

MySQL_SERVER = "localhost"
MySQL_SERVER_PORT = 3306
MySQL_SERVER_DB = "tieba"
MySQL_SERVER_USER = "mysql"
MySQL_SERVER_PWD = "xyz"



# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; rv:35.0) Gecko/20100101 Firefox/35.0'
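
These settings target the older Scrapy release this project was written against (the `scrapy.contrib` imports in the spider below are of the same vintage). On a recent Scrapy version, ITEM_PIPELINES is expected to be a dict mapping the pipeline path to an order value rather than a plain list; a minimal equivalent would be:

```python
# Equivalent pipeline registration for newer Scrapy versions (dict of path -> order)
ITEM_PIPELINES = {
    'tieba.pipelines.MySQLDBPipeline': 300,
}
```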

 

Data crawling (TiebaSpider.py; only topic posts are handled, reply parsing is not finished yet):

#coding=utf-8
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.selector import HtmlXPathSelector
from tieba.items import SubjectItem
from tieba.items import CommentItem
from tieba import settings
import scrapy
import json

class TiebaSpider(CrawlSpider):
    name = 'tieba'
    allowed_domains = ['tieba.baidu.com'] # Note: promoted/ad posts appear to live on other domains, so restricting to this domain already filters them out
    start_urls = [settings.START_URL]
    # Assume fewer than 1000*50 topic posts within 20 days; adjust this, or compute the exact page count from the post timestamps shown on the list pages.
    for x in range(0, 1000):
        start_urls.append(settings.START_URL + "&pn=" + str((x+1) * 50))
    rules = [Rule(LinkExtractor(allow=['/p/\d+']), 'parse_subject_shanghai')] # only topic-post pages are parsed here
    
    
    def parse_subject_shanghai(self, response):
        try:
            torrent = SubjectItem()
            torrent['url'] = response.url
            torrent['id'] = response.url.split('/p')[1].split('/')[1].split('?')[0]
            torrent['commentNum'] = response.xpath("//*[@id='thread_theme_5']/div[1]/ul/li[2]/span[1]/text()").extract()[0]
            # Locating the content by id did not work; one likely reason is the custom <cc> tag wrapping it
            torrent['content'] = response.xpath("//*/cc/div/text()").extract()[0]
            dataField = json.loads(str(response.xpath("//*[@id='j_p_postlist']/div[1]/@data-field").extract()[0]))
            # Much of this information is not in the rendered HTML (it is generated client-side by JS); the raw values live in the data-field JSON attribute
            torrent['created'] = dataField['content']['date'].strip()+":00"
            torrent['title'] = response.xpath("//*[@id='j_core_title_wrap']/div/h1/text()").extract()[0]
            torrent['tiebaName'] = response.xpath("//*[@id='container']/div/div[1]/div[2]/div[2]/a/text()").extract()[0].strip()
            torrent['authorName'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/text()").extract()[0]
            torrent['authorUrl'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[3]/a/@href").extract()[0]
            torrent['authorAvatar'] = response.xpath("//*[@id='j_p_postlist']/div[1]/div[2]/ul/li[1]/div/a/img/@src").extract()[0]
            if not "http://tieba.baidu.com" in torrent['authorUrl']:
                torrent['authorUrl'] = "http://tieba.baidu.com" + torrent['authorUrl']
            
            # Collect up to the first three images embedded in the post body
            hxs = HtmlXPathSelector(response)
            subject_post_div = hxs.select("//*/cc/div")[0]
            imgs = ['', '', '']
            index = 1
            for img in subject_post_div.select(".//img/@src"):
                if index > 3:
                    break
                imgs[index-1] = img.extract()
                index += 1
            torrent['image1'],torrent['image2'],torrent['image3'] = imgs
            # Parsing of the topic post itself is complete at this point
            
            totalCommentPage = int(response.xpath("//div[@id='thread_theme_5']/div[1]/ul/li[2]/span[2]/text()").extract()[0])
            # Page 1 is the current response; request pages 2..totalCommentPage for the replies
            for x in range(2, totalCommentPage + 1):
                url = torrent['url'] + ("?pn=%s" % x)
                yield scrapy.Request(url=url, callback=self.parse_comments_shanghai)
            
        except:
            torrent['id'] = None
            pass
        yield torrent
        
        
    def parse_comments_shanghai(self, response):
        # Work in progress: reply parsing is not finished yet; for now this only
        # locates the reply nodes on a comment page and dumps them for inspection.
        try:
            items = []
            hxs = HtmlXPathSelector(response)
            j_p_postlist = hxs.select("//div[@id='j_p_postlist']").select(".//div[@class='l_post l_post_bright ']")
            for childNode in j_p_postlist:
                print childNode.extract()
        except:
            for item in items:
                item['id'] = None
            pass
        return items
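
Reply parsing is still a stub above, so here is a minimal sketch of what parse_comments_shanghai could look like once it fills CommentItem. It leans on the same data-field JSON attribute that parse_subject_shanghai already uses; the nested key names (author.user_name, content.post_no, content.date) are assumptions about that attribute's layout and should be verified against a live reply page.

```python
    # Sketch only: replaces the stub above once reply parsing is implemented.
    # The data-field key names used here are assumptions and need verifying.
    def parse_comments_shanghai(self, response):
        article_id = response.url.split('/p/')[1].split('?')[0]
        for node in response.xpath("//div[@id='j_p_postlist']/div[contains(@class, 'l_post')]"):
            try:
                data_field = json.loads(node.xpath("./@data-field").extract()[0])
                comment = CommentItem()
                comment['article_id'] = article_id
                comment['authorName'] = data_field.get('author', {}).get('user_name', '')
                comment['index'] = data_field.get('content', {}).get('post_no', '')
                comment['created'] = data_field.get('content', {}).get('date', '').strip()
                # The reply body sits inside the custom <cc> tag, same as the topic post
                comment['content'] = ''.join(node.xpath(".//cc/div//text()").extract()).strip()
                comment['authorUrl'] = ''
                comment['authorAvatar'] = ''
                yield comment
            except (IndexError, ValueError, KeyError):
                continue
```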

 

Data persistence (topic posts only for now):

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import settings
from scrapy import log
import traceback
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi
from datetime import datetime

def strtodatetime(datestr,format):      
    return datetime.strptime(datestr,format)  

class MySQLDBPipeline(object):
    def __init__(self):
        self.date_time_format = "%Y-%m-%d %H:%M:%S"
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                    host = settings.MySQL_SERVER,
                    db = settings.MySQL_SERVER_DB,
                    port = settings.MySQL_SERVER_PORT,
                    user = settings.MySQL_SERVER_USER,
                    passwd = settings.MySQL_SERVER_PWD,
                    cp_reconnect = True,
                    cursorclass = MySQLdb.cursors.DictCursor,
                    charset = 'utf8',
                    use_unicode = True) 

    def process_item(self, item, spider):
        # run db query in thread pool
        query = self.dbpool.runInteraction(self._conditional_insert, item).addErrback(self.handle_error)
        return item
    
    def _conditional_insert(self, tx, item):
            if item.get('id') and item.get('created'):
                today = datetime.now()
                postDay = strtodatetime(item.get('created'), self.date_time_format)
                # Only insert/update posts created within the last TOTAL_DAYS days
                if (today - postDay).days <= int(settings.TOTAL_DAYS):
                    args= (item['id'],
                     item['title'],
                     item['url'],
                     item['tiebaName'],
                     item['authorName'],
                     item['authorUrl'],
                     item['authorAvatar'],
                     item['content'],
                     item['created'],
                     item['image1'],
                     item['image2'],
                     item['image3'],  
                     item['commentNum'],
                     item['commentNum']
                     )
                                    
                    # Use a parameterized query so quotes in the content cannot break the SQL
                    sql = '''insert into tieba_articles(id, title, url, tiebaName, authorName, authorUrl, authorAvatar, content, created, image1, image2, image3, commentNum)
                          VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE commentNum = %s
                          '''

                    tx.execute(sql, args)
     
    def handle_error(self, e):
        log.err(e)    
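
The schema of tieba_articles is never shown in the article, so below is a minimal sketch of a helper script that creates a compatible table. Only the column names and the primary key on id follow from the INSERT above; every type, length, and the charset are assumptions. It uses the same plain `import settings` as the pipeline, so it has to be run from inside the tieba package directory.

```python
# -*- coding: utf-8 -*-
# One-off helper (sketch): create the table the MySQLDBPipeline writes to.
# Column types/lengths and the charset are assumptions.
import MySQLdb
import settings

DDL = """
CREATE TABLE IF NOT EXISTS tieba_articles (
    id           VARCHAR(32)  NOT NULL PRIMARY KEY,
    title        VARCHAR(255),
    url          VARCHAR(255),
    tiebaName    VARCHAR(64),
    authorName   VARCHAR(64),
    authorUrl    VARCHAR(255),
    authorAvatar VARCHAR(255),
    content      TEXT,
    created      DATETIME,
    image1       VARCHAR(255),
    image2       VARCHAR(255),
    image3       VARCHAR(255),
    commentNum   INT
) DEFAULT CHARSET=utf8
"""

conn = MySQLdb.connect(host=settings.MySQL_SERVER,
                       port=settings.MySQL_SERVER_PORT,
                       db=settings.MySQL_SERVER_DB,
                       user=settings.MySQL_SERVER_USER,
                       passwd=settings.MySQL_SERVER_PWD,
                       charset='utf8')
conn.cursor().execute(DDL)
conn.close()
```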

 

Almost forgot: the item definitions (items.py):

 

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
class SubjectItem(scrapy.Item):
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    tiebaName = scrapy.Field()
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    commentNum = scrapy.Field()
    created = scrapy.Field()
    content = scrapy.Field()
    image1 = scrapy.Field()
    image2 = scrapy.Field()
    image3 = scrapy.Field()
    
    
    
class CommentItem(scrapy.Item):
    authorName = scrapy.Field()
    authorUrl = scrapy.Field()
    authorAvatar = scrapy.Field()
    content = scrapy.Field()
    index = scrapy.Field()
    article_id = scrapy.Field()
    created = scrapy.Field()

 

 

Summary: Scrapy defines a clear class hierarchy, so the developer only needs to focus on the business logic itself. For paginated data there are two patterns: 1) add every known URL to a list up front (as start_urls does above); 2) yield scrapy.Request(...) from inside a callback, as in the sketch below.
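
As an illustration of the second pattern, instead of pre-generating a thousand list URLs in start_urls, a spider can follow the bar's own pager. This is only a sketch: the selectors for the topic links (j_th_tit) and the "next page" link are assumptions about the current list-page markup.

```python
# -*- coding: utf-8 -*-
# Sketch of pagination pattern 2: follow the pager with yield scrapy.Request
# instead of pre-building every list URL. The link selectors are assumptions.
import urlparse

import scrapy


class TiebaListSpider(scrapy.Spider):
    name = 'tieba_list'
    allowed_domains = ['tieba.baidu.com']
    start_urls = ['http://tieba.baidu.com/f?ie=utf-8&kw=%E4%B8%8A%E6%B5%B7']

    def parse(self, response):
        # Queue every topic post found on this list page
        for href in response.xpath("//a[contains(@class, 'j_th_tit')]/@href").extract():
            yield scrapy.Request(urlparse.urljoin(response.url, href),
                                 callback=self.parse_topic)
        # Then follow the "next page" link until the pager runs out
        next_page = response.xpath("//a[contains(@class, 'next')]/@href").extract()
        if next_page:
            yield scrapy.Request(urlparse.urljoin(response.url, next_page[0]),
                                 callback=self.parse)

    def parse_topic(self, response):
        # Hand the topic page to whatever parser is in use,
        # e.g. the parse_subject_shanghai logic above.
        pass
```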

For parsing, you can use Scrapy's built-in XPath selectors or a third-party module such as BeautifulSoup; a small example follows.
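
For instance, the reply extraction could be done by handing the response body to BeautifulSoup inside a callback. A rough sketch, assuming the bs4 package is installed and that the id/class names match those used in the spider above:

```python
# Sketch: parse a Tieba reply page with BeautifulSoup instead of XPath.
# Assumes bs4 is installed; the id/class names mirror the XPath used earlier.
from bs4 import BeautifulSoup


def extract_reply_texts(response):
    soup = BeautifulSoup(response.body, 'html.parser')
    texts = []
    for div in soup.select("div#j_p_postlist div.l_post"):
        cc = div.find('cc')  # reply body lives inside the custom <cc> tag
        if cc is not None:
            texts.append(cc.get_text(strip=True))
    return texts
```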

### Crawling Baidu Tieba comments with the Scrapy framework

To crawl Baidu Tieba comments with Scrapy, create a new Scrapy project and write a dedicated Spider script. The concrete steps are as follows.

#### Create the Scrapy project

First, run the following command in a terminal to initialize a new Scrapy project:

```bash
scrapy startproject baidu_tieba_crawler
```

This generates a new folder named `baidu_tieba_crawler` in the current directory.

#### Write the Spider class

Go into the project's spiders subdirectory and create a Python module that defines the crawling logic, for example `tiebacomment_spider.py`:

```python
import scrapy
from ..items import BaidutiebaCrawlerItem


class TieBaCommentSpider(scrapy.Spider):
    name = "tieba_comments"
    allowed_domains = ["tieba.baidu.com"]
    start_urls = ['http://tieba.baidu.com/p/{post_id}'.format(post_id='POST_ID')]

    def parse(self, response):
        comment_list = response.xpath('//div[@id="j_p_postlist"]/div')
        for each_comment in comment_list:
            item = BaidutiebaCrawlerItem()
            author = each_comment.xpath('.//li[@class="d_name"]//text()').get()
            content = ''.join(each_comment.xpath('.//cc/div/text()').extract()).strip()
            item['author'] = author.strip() if author else ''
            item['content'] = content
            yield item

        next_page_url = response.css('a.next::attr(href)').get()
        if next_page_url is not None:
            yield response.follow(next_page_url, callback=self.parse)
```

The snippet above shows how to extract data from the HTML page with XPath [^1]. It assumes selector paths for the author name and the comment body are already known; in practice these expressions may need to be adjusted to the actual structure of the target pages.

#### Configure the settings

Edit the `settings.py` file in the project root to set the necessary parameters, such as a USER_AGENT that mimics a browser, and enable ITEM_PIPELINES so the extracted data can be saved to a database or another storage backend.

```python
BOT_NAME = 'baidu_tieba_crawler'

SPIDER_MODULES = ['baidu_tieba_crawler.spiders']
NEWSPIDER_MODULE = 'baidu_tieba_crawler.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
}

ITEM_PIPELINES = {
    'baidu_tieba_crawler.pipelines.BaidutiebaPipeline': 300,
}
```

#### Define the Items

Declare the fields you want to capture in `items.py`, for example the user name, the timestamp, and so on.

```python
import scrapy


class BaidutiebaCrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    author = scrapy.Field()
    content = scrapy.Field()
```

Once the steps above are done, the crawler can be started from the command line:

```bash
cd path/to/baidu_tieba_crawler/
scrapy crawl tieba_comments -o output.json
```

This exports all collected results in JSON format to output.json in the project directory [^2].