Python Crawler Notes (3)

This post describes a way to crawl a novel site with Python and store the results in a MySQL database. BeautifulSoup is used to parse the HTML and pull out the book name, author, and other metadata, urllib2 is used to send the HTTP requests, and the collected records are batch-inserted into the database.
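Before the full script, here is a minimal, self-contained sketch of the core parsing step. It assumes the book pages expose their metadata as og:novel:* &lt;meta&gt; tags in the page head, which is what the full script below relies on; the sample HTML string here is made up purely for illustration:

# -*- coding: UTF-8 -*-
# Minimal sketch: pull book metadata out of og:novel:* meta tags with BeautifulSoup.
# The HTML below is an invented example of what a book page's <head> looks like.
from bs4 import BeautifulSoup

sample_html = '''
<html><head>
    <meta property="og:novel:book_name" content="ExampleBook"/>
    <meta property="og:novel:author"    content="ExampleAuthor"/>
    <meta property="og:novel:category"  content="ExampleCategory"/>
</head><body></body></html>
'''

soup = BeautifulSoup(sample_html, 'lxml')
head = soup.head
bookName = head.find(property="og:novel:book_name").get("content")
author = head.find(property="og:novel:author").get("content")
classifyName = head.find(property="og:novel:category").get("content")
print "bookName:", bookName, "author:", author, "classifyName:", classifyName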


Screenshot of the crawled results stored in the database:



The full code is as follows:

#!/usr/bin/python
# -*- coding: UTF-8 -*-

import urllib
import urllib2
import lxml
import MySQLdb
from bs4 import BeautifulSoup

# Downgrade httplib to HTTP/1.0 so responses are not chunked, which avoids
# occasional IncompleteRead errors when reading pages from this site.
import httplib
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# A desktop browser User-Agent; built as one string so no newline ends up inside the header value.
user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) '
              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
hdr = { 'User-Agent' : user_agent }

# Connect to the local MySQL instance; the xiaoshuo.book1 table must already exist.
db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()

# Parameterized INSERT used later with executemany() to batch-insert the collected books.
str_sql = '''INSERT INTO `xiaoshuo`.`book1` (`bookName`, `author`, `url`, `classifyName`, `brief`, `updateTime`,
            `status`) VALUES(%s, %s, %s, %s, %s, %s, %s)'''

def getBookInfoBaseOnUrl(url, param):
    # Fetch one book's detail page and append its metadata (as a tuple) to param.
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')

    soup = BeautifulSoup(html_data, 'lxml')
    mylist = soup.select('head')

    for item in mylist:
        # The site exposes book metadata as Open Graph style <meta property="og:novel:*"> tags.
        bookName = item.find(property="og:novel:book_name").get("content").encode('utf-8')
        #print "Book name:", bookName

        author = item.find(property="og:novel:author").get("content").encode('utf-8')
        #print "Author:", author

        url = item.find(property="og:novel:read_url").get("content").encode('utf-8')
        #print "URL:", url

        classifyName = item.find(property="og:novel:category").get("content").encode('utf-8')
        #print "Category:", classifyName

        description = item.find(property="og:description").get("content").encode('utf-8')
        #print "Brief:", description

        updateTime = item.find(property="og:novel:update_time").get("content").encode('utf-8')
        #print "Update time:", updateTime

        status = item.find(property="og:novel:status").get("content").encode('utf-8')
        #print "Status:", status

        tup1 = (bookName, author, url, classifyName, description, updateTime, status)
        param.append(tup1)

def getBookList(url):
    # Fetch one category page, collect the books not yet in the database, and batch-insert them.
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')

    soup = BeautifulSoup(html_data, 'lxml')
    mylist = soup.find_all('div', class_='r')
    for item in mylist:

        param = []
        xiaoshuo_list = item.find_all('li')

        for book in xiaoshuo_list:
            # Each <li> holds "book name | author" text plus a link to the book's detail page.
            tmp = book.get_text('|').split('|', 1)
            bookName = tmp[0].encode('utf-8')
            author = tmp[1].encode('utf-8')
            url = book.find('a').get('href').encode('utf-8')
            #print "bookName:", bookName, "author:", author, "url:", url

            # Skip books already in the table; a parameterized query avoids quoting problems in book names.
            cursor.execute("select * from xiaoshuo.book1 where bookName = %s", (bookName,))
            bookInfo = cursor.fetchall()

            if len(bookInfo) > 0:
                #print "already in db:", bookInfo
                print "*****************************************************************************************"
                continue
            else:
                getBookInfoBaseOnUrl(url, param)
                print "*****************************************************************************************"

        # 'string_escape' turns the escaped bytes back into readable text for debugging.
        print "param:", str(param).decode('string_escape')

        try:
            # Batch-insert all new books collected from this category page.
            cursor.executemany(str_sql, param)
            db.commit()
        except MySQLdb.Error, e:
            sqlError = "Error:%s" % str(e)
            print "sqlError:", sqlError

def startGetChapter():
    # Parse the locally saved homepage (biquge.html) for category links, then crawl each category.
    soup = BeautifulSoup(open('biquge.html'), 'lxml')
    mylist = soup.find_all('div', class_='nav')
    for item in mylist:
        xiaoshuo_list = item.find_all('li')
        for li in xiaoshuo_list:
            print li
            url = li.find('a').get('href')
            classifyName = li.find('a').get_text().encode('utf-8')
            print "url:", url, type(url)
            if url != '/':
                # Category links are relative, so prepend the site root.
                getBookList("http://www.biquzi.com/" + url)
            print "#########################################################################################"

if __name__ == "__main__":
    print ("<<<-----Start Get Book INFO And Save Db------>>")
    startGetChapter()

    cursor.close()
    db.close()
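
The script assumes the xiaoshuo.book1 table already exists. A possible way to create it is sketched below; the table, database, and column names come from the INSERT statement above, but the column types and lengths are my own assumptions and may need adjusting:

# -*- coding: UTF-8 -*-
# Sketch: create the xiaoshuo.book1 table used by the crawler.
# Column types/lengths are assumptions inferred from the INSERT statement, not from the original post.
import MySQLdb

ddl = '''
CREATE TABLE IF NOT EXISTS `xiaoshuo`.`book1` (
    `id`           INT UNSIGNED NOT NULL AUTO_INCREMENT,
    `bookName`     VARCHAR(128),
    `author`       VARCHAR(64),
    `url`          VARCHAR(255),
    `classifyName` VARCHAR(64),
    `brief`        TEXT,
    `updateTime`   VARCHAR(32),
    `status`       VARCHAR(16),
    PRIMARY KEY (`id`)
) DEFAULT CHARSET=utf8
'''

db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()
cursor.execute(ddl)
db.commit()
cursor.close()
db.close()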



