# NOTE: before running this program, make sure the MongoDB database is configured correctly.
#coding:utf-8
import requests,pymongo,threading,logging,time
from bs4 import BeautifulSoup
class Spider:
def __init__(self):
    """Open a MongoDB client on localhost and keep a handle to the result collection.

    The collection ``result`` of database ``price`` is stored on
    ``self.collection``.
    """
    mongo = pymongo.MongoClient(host='localhost')
    # Subscript access selects the same database/collection objects as
    # attribute access (client.price.result).
    self.collection = mongo['price']['result']
def get_all_pages(self):
    """Visit list pages 1 through 10 of the price listing.

    Builds each page URL from the fixed listing prefix plus the page
    number and hands it to ``self.getPage``.
    """
    base_url = "http://www.100ppi.com/mprice/plist-1-505-"
    for page in range(1, 11):
        # The suffix must be a plain ".html"; the previous literal '.\html'
        # carried a stray backslash into every URL (and "\h" is an invalid
        # escape sequence).
        page_url = "{}{}.html".format(base_url, page)
        self.getPage(page_url)
def getPage(self,url):
r=requests.get(url)
r.encoding="utf-8"
soup=BeautifulSoup(r.text,'lxml')
s1=soup.select('.lp-table tr')
s2=s1[1:]
for tr in s2:
result={}
result['商品名称']=tr.select('.p-name a')[0].text
result['规格']