# 知识点:
# 1. 写爬虫不是一蹴而就的,要学会单点测试
# 2. 正则表达式不一定能一次抓取到位,要学会二级抓取
# 3. urllib 只能抓取一些普通的网站
# 作业:抓取51job上北上广深的Python岗位薪资情况
# encoding:utf-8
import selenium
from selenium import webdriver #模拟浏览器
import re
def getnumbername(number):
    """Scrape one 51job result page for "python" jobs and print the salaries.

    Opens a Firefox window through Selenium, loads the first result page for
    the region code built from ``number``, extracts the text of every
    ``<span class="t4">`` element with a regular expression, and prints each
    stripped cell followed by the pieces obtained by splitting it on "-".

    Args:
        number: Region digit as a string (for example "1"). It is spliced
            into the URL as ``"0" + number + "0000"``.

    Returns:
        The raw (unstripped) text of the first matched span, or None when
        the page contains no matching span.
    """
    # The query string previously contained "cotype=99°reefrom=99": the
    # "&deg" of "&degreefrom" had been decoded as an HTML entity, which
    # corrupted the parameter. It is spelled out correctly here.
    url = (
        "https://search.51job.com/list/0" + number + "0000,000000,0000,00,9,99,python,2,1.html"
        "?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
        "&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1"
        "&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line="
        "&specialarea=00&from=&welfare="
    )

    driver = selenium.webdriver.Firefox()  # drive a real Firefox instance
    try:
        driver.get(url)
        pagesource = driver.page_source  # rendered HTML of the result page
    finally:
        # Always end the browser session, even when loading the page fails,
        # so that no Firefox/driver process is left behind.
        driver.quit()

    regex = re.compile(r'<span class="t4">([\s\S]*?)</span>', re.IGNORECASE)
    mylist = regex.findall(pagesource)

    for cell in mylist:
        salary = cell.strip()
        print(salary)
        # Drop the last three characters (presumably the unit suffix of the
        # salary text - TODO confirm) and split what is left on "-".
        # The pieces are iterated rather than indexed as [0] and [1] because
        # direct indexing did not work according to the original author
        # (presumably some cells contain no "-").
        for part in salary[:-3].split("-"):
            print(part)

    return mylist[0] if mylist else None
# Script entry: scrape each configured region and print its label first.
# A wider run over region codes "1", "2", "3" labelled Beijing, Shanghai and
# Shenzhen was disabled by the original author; only Beijing is active.
# NOTE(review): verify that code "3" really corresponds to Shenzhen before
# re-enabling the wider run.
pythonlist = ["1"]
city = ["北京"]
# zip pairs every region code with its label, replacing a manual counter.
for pystr, cityname in zip(pythonlist, city):
    print(cityname)
    getnumbername(pystr)