pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: anjuke.ershoufang index

最新推荐文章于 2023-07-07 11:08:15 发布

原创最新推荐文章于 2023-07-07 11:08:15 发布 · 5k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#MongoDB #DuplicateKeyError

python 同时被 2 个专栏收录

39 篇文章

订阅专栏

MongoDB

6 篇文章

订阅专栏

本文给出一段用于抓取安居客二手房信息的 Python 爬虫代码，并说明如何解决向 MongoDB 插入数据时出现的 pymongo.errors.DuplicateKeyError（E11000 重复键错误）。对代码稍作调整后，即可稳定地抓取并存储数据。

这个bug忙了我一下午加一个晚上，终于把它ko掉了

先附上一段爬取安居客二手房信息的代码

import re
import time
import pymongo
import requests
from bson import ObjectId
from lxml import etree
from pprint import pprint
# HTTP request headers sent with every page request (passed as `headers=`
# to requests.get in get_info): a desktop Chrome User-Agent, a cookie
# string, and a referer pointing at the Shanghai Anjuke home page.
# NOTE(review): the cookie is a hard-coded value that looks like a captured
# browser session; it presumably expires and should not live in source
# control -- confirm, and consider loading it from configuration instead.
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36",
"cookie": "aQQ_ajkguid=243E5D58-8B13-D7BD-4922-3DE583E03855; ctid=11; _ga=GA1.2.1030980732.1530799904; _gid=GA1.2.506397644.1530799904; 58tj_uuid=c606f59a-2fb9-4c91-9815-741fdf9cfe5d; als=0; lps=http%3A%2F%2Fwww.anjuke.com%2F%3Fpi%3DPZ-baidu-pc-all-biaoti%7Chttps%3A%2F%2Fwww.baidu.com%2Fs%3Fie%3Dutf-8%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D1%26tn%3Dbaidu%26wd%3D%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%26rsv_pq%3Dd71198bd000395ca%26rsv_t%3D6172VDlcx2zzRQ%252FLyCdcEidtafr%252BSvVyVXrlZ0lsK3U1MEz8066IF4byz4c%26rqlang%3Dcn%26rsv_enter%3D1%26rsv_sug3%3D5%26rsv_sug1%3D5%26rsv_sug7%3D101; twe=2; sessid=3497C1D2-43A8-6143-B2D7-CFDA33FF0C0E; new_uv=2; __xsptplus8=8.2.1530840314.1530840335.2%232%7Cwww.baidu.com%7C%7C%7C%25E5%25AE%2589%25E5%25B1%2585%25E5%25AE%25A2%7C%23%23Z7v3XnqLDcxTHeMLiqLXQSLHvXrh8k_R%23",
"referer": "https://shanghai.anjuke.com/?pi=PZ-baidu-pc-all-biaoti"
}

# Create a client for the MongoDB server at 127.0.0.1:27017.
client = pymongo.MongoClient(host='127.0.0.1', port=27017)
# Select the database by name.
db = client['anjuke']
# Select the collection (the "table") by name.
coll = db['ershoufang']

def _first_text(node, path):
    """Return the stripped first result of XPath *path* under *node*, or None.

    None is returned only when the query matches nothing.
    """
    matches = node.xpath(path)
    return matches[0].strip() if matches else None


def get_info():
    """Scrape the Anjuke Shanghai sale pages and store one document per listing.

    Requests pages ``p0`` .. ``p22``, parses every ``<li>`` under the element
    with id ``houselist-mod-new``, and passes a dict for each listing to
    ``save``. The running number of saved listings is printed after each save.

    Raises:
        IndexError: if a listing lacks one of the mandatory fields
            (``h_addr``, ``h_type``, ``h_area``, ``h_hight``, ``h_name``,
            ``h_price``). The three ``h_youshi*`` fields are optional and are
            stored as None when absent.
    """
    count = 0
    for i in range(23):
        response = requests.get(
            'https://shanghai.anjuke.com/sale/p{}/#filtersort'.format(i),
            headers=headers,
        )
        # Parse the response text into an HTML document tree.
        html = etree.HTML(response.text)
        htmls = html.xpath('//*[@id="houselist-mod-new"]/li')

        for h in htmls:
            # Build a NEW dict for every listing. PyMongo writes the generated
            # ``_id`` into the dict it inserts, so reusing a single dict would
            # resend the same ``_id`` on the next insert and raise
            # DuplicateKeyError (E11000).
            house = {
                'h_addr': h.xpath('./div[2]/div[1]/a/text()')[0].strip(),
                'h_type': h.xpath('./div[2]/div[2]/span[1]/text()')[0].strip(),
                'h_area': h.xpath('./div[2]/div[2]/span[2]/text()')[0].strip(),
                'h_hight': h.xpath('./div[2]/div[2]/span[3]/text()')[0].strip(),
                'h_name': h.xpath('./div[2]/div[2]/span[4]/text()')[0].strip(),
                # Optional tags: missing spans become None instead of being
                # hidden behind a bare ``except``.
                'h_youshi1': _first_text(h, './div[2]/div[4]/span[1]/text()'),
                'h_youshi2': _first_text(h, './div[2]/div[4]/span[2]/text()'),
                'h_youshi3': _first_text(h, './div[2]/div[4]/span[3]/text()'),
                'h_price': h.xpath('./div[3]/span[1]/strong/text()')[0].strip(),
            }
            # Short pause between listings, kept from the original.
            time.sleep(0.01)

            save(house)
            count += 1
            print(count)

def save(house):
    """Insert one listing document into the module-level collection ``coll``.

    Args:
        house: dict holding the fields of a single listing.

    A shallow copy is inserted rather than *house* itself: PyMongo adds the
    generated ``_id`` to the dict it is given, so inserting the caller's dict
    directly would make a reused dict carry the previous ``_id`` and fail
    with DuplicateKeyError (E11000) on the next call. An ``_id`` that the
    caller set explicitly is still honoured, since the copy keeps it.

    ``insert_one`` replaces ``Collection.insert``, which is deprecated since
    PyMongo 3.0 and removed in PyMongo 4.0.
    """
    coll.insert_one(dict(house))

def main():
    """Script entry point: run the scraper by calling ``get_info``."""
    get_info()

if __name__ == '__main__':