1. Using MongoDB in PyCharm
Open PyCharm, go to Settings as shown in the screenshot, and install the Mongo plugin.
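Before running the exercises below it is worth confirming that pymongo can actually reach the local MongoDB server. A minimal sanity check, assuming MongoDB is running on the default localhost:27017:

import pymongo

# Connect to the local MongoDB server (default port 27017)
client = pymongo.MongoClient('localhost', 27017)

# server_info() raises an exception if the server cannot be reached
print(client.server_info()['version'])

# List the databases that already exist on this server
print(client.list_database_names())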
Exercise 1: store data in the database
from bs4 import BeautifulSoup
import requests
import pymongo
import random  # random is used so that each sleep() pauses for a random number of seconds instead of a fixed delay, e.g. random.randint(1, 10)
import time
# Even if the data already exists in the database, these three lines are still needed when the project is re-created
client = pymongo.MongoClient('localhost', 27017)  # connect to the local MongoDB server; 27017 is the port
test = client['test']  # the database name
url_list = test['test']  # a collection ('table') in the database, similar to a sheet in an Excel workbook
def get_links_form(channel, pages, who_sells=0):
    # e.g. bj.58.com/pbdn/0/pn2/
    list_view = '{}{}/pn{}'.format(channel, str(who_sells), str(pages))
    wb_data = requests.get(list_view)
    time.sleep(random.randint(1, 10))
    soup = BeautifulSoup(wb_data.text, 'lxml')
    print(soup.prettify())  # prettify() formats the output for easier inspection
    # for link in soup.select('#jingzhun > tbody > tr > td.t.t_b > a'):
    #     item_link = link.get('href').split('?')[0]
    #     url_list.insert_one({'url': item_link})  # insert_one() replaces the deprecated insert()
    #     print(item_link)

get_links_form('http://bj.58.com/diandongche/', 3)
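Once the selector is confirmed and the commented loop is enabled, the stored links can be checked directly from the same collection. A short sketch, assuming the url_list collection defined above (the '#jingzhun' selector comes from the commented code and may no longer match the live 58.com page):

# After enabling the commented loop, verify what was stored
print(url_list.count_documents({}))  # how many links were saved
for doc in url_list.find().limit(5):
    print(doc['url'])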
Exercise 2: read the data back out of the database
# -*- coding: utf-8 -*-
import requests
import random
import time
from bs4 import BeautifulSoup
import pymongo
client = pymongo.MongoClient('localhost',27017)
ganji = client['ganji']
info_db = ganji['info']
link = ganji['link']
for url in link.find():  # find() returns every document in the link collection
    print(url['href'])
    req = requests.get(url['href'])
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    a = soup.find_all('h1', class_='title-name')
    # b = soup.find_all('h1', class_='info_titile')
    if len(a) == 0:
        # detail page that uses the info_titile / price_now layout
        data = {
            'title': soup.select('h1.info_titile')[0].get_text(),
            'price': soup.select('span.price_now i')[0].get_text(),
            # 'pub_date': soup.select('.pr-5')[0].text.strip().split(';')[0],
            'area': soup.select('div.palce_li i')[0].get_text(),
            'looktime': soup.select('span.look_time')[0].get_text(),
            'url': url['href']
        }
        info_db.insert_one(data)  # insert_one() replaces the deprecated insert()
        print('yes')
    else:
        # detail page that uses the title-name layout
        data = {
            'title': soup.select('h1.title-name')[0].get_text(),
            'price': soup.select('i.f22.fc-orange.f-type')[0].get_text(),
            'pub_date': soup.select('.pr-5')[0].text.strip().split(';')[0],
            'area': list(map(lambda x: x.text, soup.select('#wrapper > div.content.clearfix > div.leftBox > div:nth-of-type(2) > div > ul > li:nth-of-type(2) > a'))),
            'phone': soup.select('span.phoneNum-style')[0].get_text().strip(),
            'url': url['href']
        }
        info_db.insert_one(data)
        print('yes')
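With the records stored in info, the two page layouts can be told apart straight from MongoDB, because only one of them produces a looktime field. A small sketch, assuming the info_db collection defined above:

# Documents scraped from the info_titile / look_time layout
print(info_db.count_documents({'looktime': {'$exists': True}}))

# Most recent documents from the title-name layout (by insertion order)
for item in info_db.find({'pub_date': {'$exists': True}}).sort('_id', -1).limit(5):
    print(item['title'], item['price'], item['area'])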
# Earlier selector set kept for reference: it targets the old Ganji detail-page layout
# data = {
#     'title': soup.title.text.strip(),
#     'price': soup.select('.f22.fc-orange.f-type')[0].text.strip(),
#     'pub_date': soup.select('.pr-5')[0].text.strip().split(' ')[0],
#     'area': list(map(lambda x: x.text, soup.select('ul.det-infor > li:nth-of-type(3) > a'))),
#     'cates': list(soup.select('ul.det-infor > li:nth-of-type(1) > span')[0].stripped_strings),
#     'url': url
# }
# a = link_db.find().count()  # quick check of how many links were collected
# print(a)
# Original crawler that populated the link collection, kept commented out for reference:
#
# def get_link(mainlink):
#     # lasts = ['o{}/'.format(str(i)) for i in range(1, 11)]
#     req = requests.get(mainlink)
#     req.encoding = 'utf-8'
#     soup = BeautifulSoup(req.text, 'lxml')
#     links = soup.select('td.t a')
#     for link in links:
#         url = link.get('href')
#         link_db.insert_one({'href': url})
#         print("yes")
#     time.sleep(random.randint(1, 10))
#
#
# def get_main_link(start_url):
#     req = requests.get(start_url)
#     req.encoding = 'utf-8'
#     soup = BeautifulSoup(req.text, 'lxml')
#     links = soup.select('div.main dt a')
#     for link in links:
#         Links.append(host + link.get('href'))
#     return links
#     # print(Links)
#
# if __name__ == '__main__':
#     start_url = 'http://suining.ganji.com/wu/'
#     host = 'http://suining.ganji.com'
#     Links = []
#     main_links = get_main_link(start_url)
#     # print(main_links)
#     for main_link in main_links:
#         mainlink = host + main_link.get('href')
#         print(mainlink)
#         get_link(mainlink)
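The commented block above is the step that fills the link collection which exercise 2 reads from. A condensed, runnable sketch of that step under the current pymongo API (insert_one), reusing the client/ganji objects and imports from exercise 2; the function names here are illustrative, and the selectors and suining.ganji.com URLs come straight from the commented block, so they may no longer match the live site:

link_db = ganji['link']
host = 'http://suining.ganji.com'
start_url = 'http://suining.ganji.com/wu/'

def collect_category_links(url):
    # Category links on the "all goods" page
    req = requests.get(url)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    return [host + a.get('href') for a in soup.select('div.main dt a')]

def collect_item_links(category_url):
    # Detail-page links inside one category, stored one document per link
    req = requests.get(category_url)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    for a in soup.select('td.t a'):
        link_db.insert_one({'href': a.get('href')})
    time.sleep(random.randint(1, 10))

for category_url in collect_category_links(start_url):
    print(category_url)
    collect_item_links(category_url)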