BeautifulSoup + find + Oracle 爬取
不想存入数据库的话,直接用 CSV 保存成表格数据也是可以的,方便上班族使用
# 全国城市
import requests
from bs4 import BeautifulSoup
import cx_Oracle
import os
import uuid
# Force the Oracle client to treat text as UTF-8 so Chinese product names
# round-trip to the database without mojibake.
os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHINA.UTF8'
# Browser-like User-Agent for Requests, so the target site serves the page
# instead of rejecting an obvious scraper.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}  # spoof a real browser via the request headers
# Core crawling code
# Connect to Oracle and load the product-category keys (PARAKEY) for
# PARACODE 'vagetable' into the list `fenlei`.
conn = cx_Oracle.connect('username/password@127.0.0.1:1521/orcl')
print("数据库连接上了")
cursor_oracle = conn.cursor()
# Bind variable instead of string concatenation: safe against injection and
# lets Oracle reuse the cached statement. The original also opened a second
# cursor (`cr`) that was never closed; one cursor is enough.
cursor_oracle.execute(
    "select PARAKEY from BASEDATA WHERE PARACODE = :code",
    code='vagetable',
)
# fetchall() yields 1-tuples; unpack each into a flat list of keys.
fenlei = [row for (row,) in cursor_oracle.fetchall()]
print(fenlei)
# Build the crawl URLs: for every category key, page 1 plus pages 2-10,
# once nationwide and once restricted to Shandong.
num = 0  # inserts since the last commit (shared counter for the loops below)
base_url = 'http://nong.gold600.com/cx?pz='
# Shandong URLs
shanUrl = []
# nationwide URLs
allUrl = []
for key in fenlei:
    shandong_first = base_url + key + '&c=shandong'
    nation_first = base_url + key
    shanUrl.append(shandong_first)
    allUrl.append(nation_first)
    # Pages 2..10 of the same listing (page 1 has no &page parameter).
    for page in range(2, 11):
        shanUrl.append(shandong_first + '&page=' + str(page))
        allUrl.append(nation_first + '&page=' + str(page))
# Crawl every nationwide URL and insert each table row into PRODUCTPRICE.
# The INSERT uses bind variables: the original concatenated scraped text
# straight into the SQL string, so any quote character in a product name
# would break the statement (and permits SQL injection).
insert_sql = (
    "insert into PRODUCTPRICE(ID,NAME,PRICE,UNIT,AREA,INDATE,CODE) "
    "values (:id, :name, :price, :unit, :area, :indate, :code)"
)
for url in allUrl:
    print(url)
    response = requests.get(url, headers=headers)  # fetch the listing page
    soup = BeautifulSoup(response.text, 'html.parser')
    # All <tr> rows of the price table; tr[0] is the header row.
    tr = soup.find('table', class_='bx').find_all('tr')
    for j in tr[1:]:  # skip the header row
        td = j.find_all('td')
        cursor_oracle.execute(insert_sql, {
            'id': str(uuid.uuid1()),
            'name': td[0].get_text().strip(),    # product name
            'price': td[1].get_text().strip(),   # price
            'unit': td[2].get_text().strip(),    # unit
            'area': td[3].get_text().strip(),    # wholesale market
            'indate': td[4].get_text().strip(),  # date
            'code': '0',                         # 0 = nationwide data
        })
        num = num + 1
        # Commit in batches of ~100 rows to bound the open transaction.
        if num > 100:
            conn.commit()
            num = 0
# Crawl every Shandong URL and insert each table row into PRODUCTPRICE.
# Bind variables replace the original string-concatenated INSERT, which
# broke on quotes in scraped text and was open to SQL injection.
shandong_insert_sql = (
    "insert into PRODUCTPRICE(ID,NAME,PRICE,UNIT,AREA,INDATE,CODE) "
    "values (:id, :name, :price, :unit, :area, :indate, :code)"
)
for url in shanUrl:
    print(url)
    response = requests.get(url, headers=headers)  # fetch the listing page
    soup = BeautifulSoup(response.text, 'html.parser')
    # All <tr> rows of the price table; tr[0] is the header row.
    tr = soup.find('table', class_='bx').find_all('tr')
    for j in tr[1:]:  # skip the header row
        td = j.find_all('td')
        cursor_oracle.execute(shandong_insert_sql, {
            'id': str(uuid.uuid1()),
            'name': td[0].get_text().strip(),    # product name
            'price': td[1].get_text().strip(),   # price
            'unit': td[2].get_text().strip(),    # unit
            'area': td[3].get_text().strip(),    # wholesale market
            'indate': td[4].get_text().strip(),  # date
            'code': '1',                         # 1 = Shandong data
        })
        num = num + 1
        # Commit in batches of ~100 rows to bound the open transaction.
        if num > 100:
            conn.commit()
            num = 0
# Flush any rows still pending from the last partial batch, then release
# the cursor and connection (commit must happen before close).
conn.commit()
cursor_oracle.close()
conn.close()