import urllib
import urllib2
import HTMLParser
from bs4 import BeautifulSoup
import re
import MySQLdb as mdb
import json
i = 1  # running order number of the companies scraped

def GetOnePageUrl(url):
    # Crawl one listing page and visit every company link found on it.
    global i
    flag = 0
    request = urllib2.Request(url)
    html = urllib2.urlopen(request)
    soup = BeautifulSoup(html, "lxml")
    for link in soup.find_all(name='a', attrs={"href": re.compile(r'^http://qy.58.com/mq/[0-9]*/$')}):
        # print link.get('href')
        if flag % 2 == 0:  # matching links show up in pairs on the listing page, so only take every other one
            GetOneUrlInfo(link.get('href'))
            print i
            i += 1
        flag += 1
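
The href filter above only keeps company pages of the form http://qy.58.com/mq/<digits>/. As a quick, self-contained sanity check of that pattern, the snippet below runs it against two made-up URLs (both URLs are invented for illustration, not taken from the site):

pattern = re.compile(r'^http://qy.58.com/mq/[0-9]*/$')
print bool(pattern.match('http://qy.58.com/mq/12345678/'))  # True: looks like a company page
print bool(pattern.match('http://qy.58.com/hezuo/'))         # False: some other 58.com page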

def GetOneUrlInfo(url):
    # Fetch one company's detail page and pull out its basic fields.
    global i
    request = urllib2.Request(url)
    html = urllib2.urlopen(request)
    soup = BeautifulSoup(html, "lxml")
    # for addr in soup.find_all(name='td', limit=5):
    #     print addr.string
    fiveinfo = soup.find_all(name='td', limit=5)
    if len(fiveinfo) == 0:  # the link redirects to the company's own standalone site, which lacks these <td> fields, so skip it
        return
    co_name = fiveinfo[0].string
    co_type = fiveinfo[1].string
    co_numpeople = fiveinfo[2].string
    co_manager = fiveinfo[3].string
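
The scraped fields are meant to end up in MySQL (hence the MySQLdb import at the top), but that step is not shown above. A minimal sketch of it, assuming a local database named 58crawl with a table company_info(id, name, type, numpeople, manager), could look like the following; the helper name SaveCompany, the credentials, and the whole schema are placeholders, not the original code:

def SaveCompany(order, name, ctype, numpeople, manager):
    # Hedged sketch only: the database "58crawl", the table "company_info",
    # its columns, and the login credentials are all assumed placeholder names.
    con = mdb.connect('localhost', 'root', 'password', '58crawl', charset='utf8')
    with con:
        cur = con.cursor()
        cur.execute(
            "INSERT INTO company_info (id, name, type, numpeople, manager) "
            "VALUES (%s, %s, %s, %s, %s)",
            (order, name, ctype, numpeople, manager))

Under these assumptions, GetOneUrlInfo would then finish with a call such as SaveCompany(i, co_name, co_type, co_numpeople, co_manager).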