Python Web Scraping Study: Week 2 Summary

This post describes how to set up and use MongoDB from PyCharm for storing and retrieving scraped data. Two worked exercises demonstrate inserting data into and querying data out of the database, using the Python libraries BeautifulSoup, Requests, and pymongo.

1. Using MongoDB in PyCharm

Open PyCharm, go to Settings, and install the Mongo plugin as shown in the screenshot.
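Once the plugin is installed and a local MongoDB server is running, it is worth confirming from Python that the server is actually reachable before moving on to the exercises. A minimal check with pymongo, assuming MongoDB listens on localhost:27017 as the code below does, might look like this:

import pymongo
from pymongo.errors import ServerSelectionTimeoutError

# connect to the local MongoDB server; 27017 is MongoDB's default port
client = pymongo.MongoClient('localhost', 27017, serverSelectionTimeoutMS=3000)
try:
    client.admin.command('ping')  # cheap command that fails quickly if no server answers
    print('MongoDB is reachable, existing databases:', client.list_database_names())
except ServerSelectionTimeoutError as err:
    print('Could not reach MongoDB on localhost:27017:', err)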


Exercise 1: Storing data in the database

from bs4 import BeautifulSoup
import requests
import pymongo
import random   # random lets each sleep() pause for a random number of seconds, e.g. random.randint(1, 10), instead of a fixed delay
import time
# even if the data already exists in the database, these three lines still need to be written when the project is recreated
client = pymongo.MongoClient('localhost',27017)  # connect to the local MongoDB server; the number is the port
test = client['test']  # database name
url_list = test['test']  # a collection in the database, similar to a sheet in an Excel workbook

def get_links_form(channel,pages,who_sells=0):
    # bj.58.com/pbdn/pn2/
    list_view = '{}{}/pn{}'.format(channel,str(who_sells),str(pages))
    wb_data = requests.get(list_view)
    time.sleep(random.randint(1,10))
    soup = BeautifulSoup(wb_data.text,'lxml')
    print(soup.prettify()) # prettify() just makes the printed HTML easier to read
    # for link in soup.select('#jingzhun > tbody > tr > td.t.t_b > a'):
    #     item_link = link.get('href').split('?')[0]
    #     url_list.insert({'url':item_link})
    #     print(item_link)


get_links_form('http://bj.58.com/diandongche/',3)
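For reference, here is one way the commented-out link-extraction loop could look once it is enabled. This is only a sketch: the save_links() name is hypothetical, the 'td.t a' selector is borrowed from the commented-out code further below and assumes the 58.com list pages still use that markup, and insert_one() is used because newer pymongo versions removed the old insert() method. It reuses the imports and the url_list collection defined above.

def save_links(channel, pages, who_sells=0):
    # build the list-page URL, e.g. http://bj.58.com/diandongche/0/pn3
    list_view = '{}{}/pn{}'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(random.randint(1, 10))  # random pause between requests
    soup = BeautifulSoup(wb_data.text, 'lxml')
    for link in soup.select('td.t a'):  # selector is an assumption; adjust it to the live page
        item_link = link.get('href').split('?')[0]  # drop query-string parameters
        url_list.insert_one({'url': item_link})  # store each ad link as its own document
        print(item_link)

save_links('http://bj.58.com/diandongche/', 3)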

Exercise 2: Retrieving the data from the database

# -*- coding: utf-8 -*-
import requests
import random
import time
from bs4 import BeautifulSoup
import pymongo

client = pymongo.MongoClient('localhost',27017)
ganji = client['ganji']
info_db = ganji['info']
link = ganji['link']

for url in link.find():  # find() retrieves every document in the link collection
    print(url['href'])
    req = requests.get(url['href'])
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text,'lxml')
    a = soup.find_all('h1', class_='title-name')
    # b = soup.find_all('h1', class_='info_titile')
    if len(a) == 0:
        data = {
                'title':soup.select('h1.info_titile')[0].get_text(),
                'price':soup.select('span.price_now i')[0].get_text(),
                # 'pub_date':soup.select('.pr-5')[0].text.strip().split(';')[0],
                'area':soup.select('div.palce_li i')[0].get_text(),
                'looktime':soup.select('span.look_time')[0].get_text(),
                'url':url['href']
           }
        info_db.insert(data)
        print('yes')
    else:
        data = {
                'title':soup.select('h1.title-name')[0].get_text(),
                'price':soup.select('i.f22.fc-orange.f-type')[0].get_text(),
                'pub_date':soup.select('.pr-5')[0].text.strip().split(';')[0],
                'area':list(map(lambda x:x.text,soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(2) > a'))),
                'phone':soup.select('span.phoneNum-style')[0].get_text().strip(),
                'url':url['href']
           }
        info_db.insert(data)
        print('yes')

        # data = {
        #     'title':soup.title.text.strip(),
        #     'price':soup.select('.f22.fc-orange.f-type')[0].text.strip(),
        #     'pub_date':soup.select('.pr-5')[0].text.strip().split(' ')[0],
        #     'area':list(map(lambda x:x.text,soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
        #     'cates':list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
        #     'url':url
        # }
# a = link_db.find().count()
# print(a)
# def get_link(mainlink):
#     # lasts = ['o{}/'.format(str(i)) for i in range(1,11)]
#     req  = requests.get(mainlink)
#     req.encoding = 'utf-8'
#     soup = BeautifulSoup(req.text,'lxml')
#     links = soup.select('td.t a')
#     for link in links:
#         url = link.get('href')
#         link_db.insert({'href':url})
#         print("yes")
#         time.sleep(random.randint(1,10))
#
#
# def get_main_link(start_url):
#     req = requests.get(start_url)
#     req.encoding = 'utf-8'
#     soup = BeautifulSoup(req.text,'lxml')
#     links = soup.select('div.main dt a')
#     for link in links:
#         Links.append(host + link.get('href'))
#     return links
#     # print(Links)
# if __name__ == '__main__':
#     start_url = 'http://suining.ganji.com/wu/'
#     host = 'http://suining.ganji.com'
#     Links = []
#     main_links = get_main_link(start_url)
#     # print(main_links)
#     for main_link in main_links:
#         mainlink = host + main_link.get('href')
#         print(mainlink)
#         get_link(mainlink)
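The commented-out get_main_link() and get_link() functions above appear to be what originally populated the link collection that this exercise iterates over. Once the main loop has run, pymongo can also be used to spot-check what actually landed in the info collection. A minimal sketch, reusing the client, ganji, and info_db names defined above (the price value in the filter is purely illustrative):

# how many documents were scraped in total
print('documents stored:', info_db.count_documents({}))

# print the first few records to eyeball the stored fields
for item in info_db.find().limit(5):
    print(item['title'], item['price'], item['url'])

# example filter: items whose price was stored as the string '500'
# (prices were scraped as page text, so they are matched as strings here)
for item in info_db.find({'price': '500'}):
    print(item['title'], item['url'])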

